From 7ff148e2ca320144da0522235295884000107c34 Mon Sep 17 00:00:00 2001 From: Ian Tenney Date: Sun, 21 Jul 2019 20:14:27 -0700 Subject: [PATCH] Update edge probing runner and tokenization scripts (#858) * Update NFS paths * Remove unused experiment scripts * Fix edge probing configs * Add username to Kubernetes jobs Plays nicer on multi-user environments, now everything is written to /nfs/jiant/exp/$USER/$PROJECT_NAME and jobs are prefixed with $USER * Move args to top of retokenize script for better documentation * Update retokenize_bert.sh * Update edge probing experiment script * Add username to job name and project dir for Kubernetes runs Better default behavior on a multi-user cluster. * Formatter --- config/edgeprobe/edgeprobe_bare.conf | 2 - probing/retokenize_bert.sh | 35 +++++++++------- probing/retokenize_edge_data.py | 16 ++++--- scripts/edgeprobing/exp_fns.sh | 26 +++++++----- scripts/edgeprobing/kubernetes_run_all.sh | 38 ++++++++++------- scripts/edgeprobing/ortho_elmo_baselines.sh | 44 -------------------- scripts/edgeprobing/random_elmo_baselines.sh | 44 -------------------- scripts/edgeprobing/train_chars_baselines.sh | 36 ---------------- scripts/edgeprobing/train_full_baselines.sh | 36 ---------------- 9 files changed, 66 insertions(+), 211 deletions(-) delete mode 100755 scripts/edgeprobing/ortho_elmo_baselines.sh delete mode 100755 scripts/edgeprobing/random_elmo_baselines.sh delete mode 100755 scripts/edgeprobing/train_chars_baselines.sh delete mode 100755 scripts/edgeprobing/train_full_baselines.sh diff --git a/config/edgeprobe/edgeprobe_bare.conf b/config/edgeprobe/edgeprobe_bare.conf index d2e8c05e8..f2ddcb396 100644 --- a/config/edgeprobe/edgeprobe_bare.conf +++ b/config/edgeprobe/edgeprobe_bare.conf @@ -24,8 +24,6 @@ lr_patience = 5 // vals until LR decay patience = 20 // vals until early-stopping tokenizer = "" // use native tokenization with ELMo -elmo = 1 -elmo_chars_only = 1 // set to 0 to use full ELMo // Use no-op encoder (no params). sent_enc = "none" diff --git a/probing/retokenize_bert.sh b/probing/retokenize_bert.sh index e364e6dcf..f8d63f874 100755 --- a/probing/retokenize_bert.sh +++ b/probing/retokenize_bert.sh @@ -2,34 +2,39 @@ set -eu -EDGE_DATA_PATH="/nfs/jsalt/share/glue_data/edges" +EDGE_DATA_PATH="$JIANT_DATA_DIR/edges" +echo "Processing edge probing data in $EDGE_DATA_PATH" declare -a SUBPATHS -SUBPATHS+=( "ontonotes-constituents" ) -SUBPATHS+=( "dep_ewt" ) -SUBPATHS+=( "ontonotes-ner" ) -SUBPATHS+=( "srl_conll2012" ) -SUBPATHS+=( "ontonotes-coref-conll" ) SUBPATHS+=( "spr1" ) SUBPATHS+=( "spr2" ) SUBPATHS+=( "dpr" ) +SUBPATHS+=( "dep_ewt" ) +SUBPATHS+=( "ontonotes/const/pos" ) +SUBPATHS+=( "ontonotes/const/nonterminal" ) +SUBPATHS+=( "ontonotes/srl" ) +SUBPATHS+=( "ontonotes/ner" ) +SUBPATHS+=( "ontonotes/coref" ) +SUBPATHS+=( "semeval" ) +SUBPATHS+=( "tacred/rel" ) +SUBPATHS+=( "noun_verb" ) for subpath in "${SUBPATHS[@]}"; do - python $(dirname $0)/retokenize_edge_data.bert.py \ - --model bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json & - python $(dirname $0)/retokenize_edge_data.bert.py \ - --model bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json & + python $(dirname $0)/retokenize_edge_data.py \ + -t bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json & + python $(dirname $0)/retokenize_edge_data.py \ + -t bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json & done # exit 0 # Only use the cased model on NER, per https://arxiv.org/pdf/1810.04805.pdf -CASED_SUBPATHS=( "ontonotes-ner" ) +CASED_SUBPATHS=( "ontonotes/ner" ) for subpath in "${CASED_SUBPATHS[@]}"; do - python $(dirname $0)/retokenize_edge_data.bert.py \ - --model bert-base-cased $EDGE_DATA_PATH/$subpath/*.json & - python $(dirname $0)/retokenize_edge_data.bert.py \ - --model bert-large-cased $EDGE_DATA_PATH/$subpath/*.json & + python $(dirname $0)/retokenize_edge_data.py \ + -t bert-base-cased $EDGE_DATA_PATH/$subpath/*.json & + python $(dirname $0)/retokenize_edge_data.py \ + -t bert-large-cased $EDGE_DATA_PATH/$subpath/*.json & done diff --git a/probing/retokenize_edge_data.py b/probing/retokenize_edge_data.py index b967d9216..83f19b5a6 100755 --- a/probing/retokenize_edge_data.py +++ b/probing/retokenize_edge_data.py @@ -41,6 +41,12 @@ log.basicConfig(format="%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p", level=log.INFO) +PARSER = argparse.ArgumentParser() +PARSER.add_argument("-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name.") +PARSER.add_argument( + "--num_parallel", type=int, default=4, help="Number of parallel processes to use." +) +PARSER.add_argument("inputs", type=str, nargs="+", help="Input JSON files.") # For now, this module expects MosesTokenizer as the default. # TODO: change this once we have better support in core utils. @@ -81,15 +87,7 @@ def retokenize_file(fname, tokenizer_name, worker_pool): def main(args): - parser = argparse.ArgumentParser() - parser.add_argument( - "-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name." - ) - parser.add_argument( - "--num_parallel", type=int, default=4, help="Number of parallel processes to use." - ) - parser.add_argument("inputs", type=str, nargs="+", help="Input JSON files.") - args = parser.parse_args(args) + args = PARSER.parse_args(args) worker_pool = multiprocessing.Pool(args.num_parallel) for fname in args.inputs: diff --git a/scripts/edgeprobing/exp_fns.sh b/scripts/edgeprobing/exp_fns.sh index 2768e6686..30aef385b 100644 --- a/scripts/edgeprobing/exp_fns.sh +++ b/scripts/edgeprobing/exp_fns.sh @@ -14,6 +14,8 @@ # # See individual functions below for usage. +EP_RESOURCE_DIR="/nfs/jiant/share/edge-probing/resources" + function run_exp() { # Helper function to invoke main.py. # Don't run this directly - use the experiment functions below, @@ -34,7 +36,7 @@ function elmo_chars_exp() { # Lexical baseline, probe ELMo char CNN layer. # Usage: elmo_chars_exp OVERRIDES="exp_name=elmo-chars-$1, run_name=run" - OVERRIDES+=", target_tasks=$1" + OVERRIDES+=", target_tasks=$1, input_module=elmo-chars-only" run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}" } @@ -42,16 +44,16 @@ function elmo_full_exp() { # Full ELMo, probe full ELMo with learned mixing weights. # Usage: elmo_full_exp OVERRIDES="exp_name=elmo-full-$1, run_name=run" - OVERRIDES+=", target_tasks=$1, elmo_chars_only=0" + OVERRIDES+=", target_tasks=$1, input_module=elmo" run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}" } function elmo_ortho_exp() { # Full ELMo with random orthogonal weights for LSTM and projections. # Usage: elmo_ortho_exp - ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5" + ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5" OVERRIDES="exp_name=elmo-ortho-$1, run_name=run_seed_$2" - OVERRIDES+=", target_tasks=$1, elmo_chars_only=0" + OVERRIDES+=", target_tasks=$1, input_module=elmo" OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}" run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}" } @@ -59,9 +61,9 @@ function elmo_ortho_exp() { function elmo_random_exp() { # Full ELMo with random normal weights for LSTM and projections. # Usage: elmo_random_exp - ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5" + ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5" OVERRIDES="exp_name=elmo-random-$1, run_name=run_seed_$2" - OVERRIDES+=", target_tasks=$1, elmo_chars_only=0" + OVERRIDES+=", target_tasks=$1, input_module=elmo" OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}" run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}" } @@ -71,6 +73,7 @@ function train_chars_exp() { # Usage: train_chars_exp OVERRIDES="exp_name=train-chars-$1, run_name=run" OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3" + OVERRIDES+=", input_module=elmo-chars-only" run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}" } @@ -78,7 +81,8 @@ function train_full_exp() { # Trained encoder over full ELMo. # Usage: train_full_exp OVERRIDES="exp_name=train-full-$1, run_name=run" - OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3, elmo_chars_only=0" + OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3" + OVERRIDES+=", input_module=elmo" run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}" } @@ -140,7 +144,7 @@ function openai_mix_exp() { function openai_bwb_exp() { # Probe the OpenAI transformer model, as trained on BWB-shuffled. # Usage: openai_bwb_exp - CKPT_PATH="/nfs/jsalt/home/iftenney/checkpoints/bwb_shuffled/model.ckpt-1000000" + CKPT_PATH="${EP_RESOURCE_DIR}/checkpoints/bwb_shuffled/model.ckpt-1000000" OVERRIDES="exp_name=openai-bwb-$1, run_name=run" OVERRIDES+=", target_tasks=$1" OVERRIDES+=", openai_transformer_ckpt=${CKPT_PATH}" @@ -157,7 +161,7 @@ function bert_cat_exp() { # Usage: bert_cat_exp OVERRIDES="exp_name=bert-${2}-cat-${1}, run_name=run" OVERRIDES+=", target_tasks=$1" - OVERRIDES+=", bert_model_name=bert-$2" + OVERRIDES+=", input_module=bert-$2" OVERRIDES+=", bert_embeddings_mode=cat" run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}" } @@ -167,7 +171,7 @@ function bert_lex_exp() { # Usage: bert_lex_exp OVERRIDES="exp_name=bert-${2}-lex-${1}, run_name=run" OVERRIDES+=", target_tasks=$1" - OVERRIDES+=", bert_model_name=bert-$2" + OVERRIDES+=", input_module=bert-$2" OVERRIDES+=", bert_embeddings_mode=only" run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}" } @@ -177,7 +181,7 @@ function bert_mix_exp() { # Usage: bert_mix_exp OVERRIDES="exp_name=bert-${2}-mix-${1}, run_name=run" OVERRIDES+=", target_tasks=$1" - OVERRIDES+=", bert_model_name=bert-$2" + OVERRIDES+=", input_module=bert-$2" OVERRIDES+=", bert_embeddings_mode=mix" run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}" } diff --git a/scripts/edgeprobing/kubernetes_run_all.sh b/scripts/edgeprobing/kubernetes_run_all.sh index afef50c82..1e7f25486 100755 --- a/scripts/edgeprobing/kubernetes_run_all.sh +++ b/scripts/edgeprobing/kubernetes_run_all.sh @@ -12,14 +12,14 @@ set -e # Default arguments. -PROJECT="" +PROJECT_NAME="" NOTIFY_EMAIL="$NOTIFY_EMAIL" # allow pre-set from shell # Handle flags. OPTIND=1 # Reset in case getopts has been used previously in the shell. while getopts ":p:n:" opt; do case "$opt" in - p) PROJECT=$OPTARG + p) PROJECT_NAME=$OPTARG ;; n) NOTIFY_EMAIL=$OPTARG ;; @@ -34,7 +34,7 @@ shift $((OPTIND-1)) # Remaining positional arguments. MODE=${1:-"create"} -if [ -z $PROJECT ]; then +if [ -z $PROJECT_NAME ]; then echo "You must provide a project name!" exit 1 fi @@ -47,8 +47,18 @@ fi # Top-level directory for the current repo. pushd $(git rev-parse --show-toplevel) +# Get the NFS path from the Kubernetes config, so that it doesn't need to be +# hardcoded here. +pushd gcp/kubernetes/templates +NFS_EXP_DIR=$(jsonnet -S -e "local env = import 'jiant_env.libsonnet'; env.nfs_exp_dir") +echo "Assuming NFS experiment path at $NFS_EXP_DIR" +popd + # Make a copy of the current tree in the project directory. -PROJECT_DIR="/nfs/jsalt/exp/$PROJECT" +PROJECT_DIR="${NFS_EXP_DIR}/${USER}/${PROJECT_NAME}" +if [ ! -d "${NFS_EXP_DIR}/$USER" ]; then + mkdir "${NFS_EXP_DIR}/$USER" +fi if [ ! -d "${PROJECT_DIR}" ]; then echo "Creating project directory ${PROJECT_DIR}" mkdir ${PROJECT_DIR} @@ -65,7 +75,7 @@ function make_kubernetes_command() { # Uses exp_fns.sh to generate configs; see that file for details # and to define new experiments. echo -n "pushd ${PATH_TO_JIANT}" - echo -n "; source scripts/edges/exp_fns.sh" + echo -n "; source scripts/edgeprobing/exp_fns.sh" echo -n "; $@" } @@ -74,8 +84,9 @@ function kuberun() { NAME=$1 COMMAND=$(make_kubernetes_command $2) echo "Job '$NAME': '$COMMAND'" - ./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT} -g ${GPU_TYPE} \ - -n ${NOTIFY_EMAIL} $NAME "$COMMAND" + ./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT_NAME} -g ${GPU_TYPE} \ + $NAME "$COMMAND" + # -n ${NOTIFY_EMAIL} \ # Temporarily disabled echo "" } @@ -103,14 +114,14 @@ if [[ $MODE == "delete" ]]; then fi ## -# Run these on the main 'jsalt' cluster -gcloud container clusters get-credentials --zone us-east1-c jsalt +# Run these on p100s (default) export GPU_TYPE="p100" for task in "${ALL_TASKS[@]}" do - kuberun elmo-chars-$task "elmo_chars_exp edges-$task" - kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0" - kuberun elmo-full-$task "elmo_full_exp edges-$task" + # ELMo is currently broken at master, so skip these. + # kuberun elmo-chars-$task "elmo_chars_exp edges-$task" + # kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0" + # kuberun elmo-full-$task "elmo_full_exp edges-$task" kuberun glove-$task "glove_exp edges-$task" kuberun cove-$task "cove_exp edges-$task" @@ -120,8 +131,7 @@ do done ## -# Run these on 'jsalt-central' for V100s -gcloud container clusters get-credentials --zone us-central1-a jsalt-central +# Run the larger experiments (transformers) on v100s export GPU_TYPE="v100" for task in "${ALL_TASKS[@]}" do diff --git a/scripts/edgeprobing/ortho_elmo_baselines.sh b/scripts/edgeprobing/ortho_elmo_baselines.sh deleted file mode 100755 index bd4355d28..000000000 --- a/scripts/edgeprobing/ortho_elmo_baselines.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Script to run edge probing on ELMo-only. -# Run as separate experiments, since datasets are disjoint anyway. - -NOTIFY_EMAIL=$1 - -function run_exp() { - OVERRIDES="exp_name=elmo-ortho-$1, run_name=run_seed_$2" - OVERRIDES+=", pretrain_tasks=$1, elmo_chars_only=0" - OVERRIDES+=", target_tasks=$1, elmo_weight_file_path=/nfs/jsalt/home/berlin/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5" - python main.py --config_file config/edgeprobe/edgeprobe_bare.conf \ - -o "${OVERRIDES}" \ - --remote_log --notify "$NOTIFY_EMAIL" -} - -set -eux - -cd $(dirname $0) -pushd "${PWD%jiant*}/jiant" - -run_exp "edges-srl-conll2005" "0" -run_exp "edges-srl-conll2005" "1" -run_exp "edges-srl-conll2005" "2" -run_exp "edges-spr2" "0" -run_exp "edges-spr2" "1" -run_exp "edges-spr2" "2" -run_exp "edges-dpr" "0" -run_exp "edges-dpr" "1" -run_exp "edges-dpr" "2" -run_exp "edges-coref-ontonotes" "0" -run_exp "edges-coref-ontonotes" "1" -run_exp "edges-coref-ontonotes" "2" -run_exp "edges-dep-labeling" "0" -run_exp "edges-dep-labeling" "1" -run_exp "edges-dep-labeling" "2" -run_exp "edges-ner-conll2003" "0" -run_exp "edges-ner-conll2003" "1" -run_exp "edges-ner-conll2003" "2" -run_exp "edges-constituent-ptb" "0" -run_exp "edges-constituent-ptb" "1" -run_exp "edges-constituent-ptb" "2" - -#sudo poweroff diff --git a/scripts/edgeprobing/random_elmo_baselines.sh b/scripts/edgeprobing/random_elmo_baselines.sh deleted file mode 100755 index 91b22e7c5..000000000 --- a/scripts/edgeprobing/random_elmo_baselines.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Script to run edge probing on ELMo-only. -# Run as separate experiments, since datasets are disjoint anyway. - -NOTIFY_EMAIL=$1 - -function run_exp() { - OVERRIDES="exp_name=elmo-random-$1, run_name=run_seed_$2" - OVERRIDES+=", pretrain_tasks=$1, elmo_chars_only=0" - OVERRIDES+=", target_tasks=$1, elmo_weight_file_path=/nfs/jsalt/home/berlin/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5" - python main.py --config_file config/edgeprobe/edgeprobe_bare.conf \ - -o "${OVERRIDES}" \ - --remote_log --notify "$NOTIFY_EMAIL" -} - -set -eux - -cd $(dirname $0) -pushd "${PWD%jiant*}/jiant" - -run_exp "edges-srl-conll2005" "0" -run_exp "edges-srl-conll2005" "1" -run_exp "edges-srl-conll2005" "2" -run_exp "edges-spr2" "0" -run_exp "edges-spr2" "1" -run_exp "edges-spr2" "2" -run_exp "edges-dpr" "0" -run_exp "edges-dpr" "1" -run_exp "edges-dpr" "2" -run_exp "edges-coref-ontonotes" "0" -run_exp "edges-coref-ontonotes" "1" -run_exp "edges-coref-ontonotes" "2" -run_exp "edges-dep-labeling" "0" -run_exp "edges-dep-labeling" "1" -run_exp "edges-dep-labeling" "2" -run_exp "edges-ner-conll2003" "0" -run_exp "edges-ner-conll2003" "1" -run_exp "edges-ner-conll2003" "2" -run_exp "edges-constituent-ptb" "0" -run_exp "edges-constituent-ptb" "1" -run_exp "edges-constituent-ptb" "2" - -#sudo poweroff diff --git a/scripts/edgeprobing/train_chars_baselines.sh b/scripts/edgeprobing/train_chars_baselines.sh deleted file mode 100755 index 838085a8e..000000000 --- a/scripts/edgeprobing/train_chars_baselines.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# Script to run edge probing on ELMo char-CNN only. -# Run as separate experiments, since datasets are disjoint anyway. - -NOTIFY_EMAIL=$1 - -function run_exp() { - OVERRIDES="exp_name=train-chars-$1, run_name=run" - OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3" - python main.py --config_file config/edgeprobe/edgeprobe_train.conf \ - -o "${OVERRIDES}" \ - --remote_log --notify "$NOTIFY_EMAIL" -} - -set -eux - -cd $(dirname $0) -pushd "${PWD%jiant*}/jiant" - -# Small tasks -run_exp "edges-spr2" 100 100 -run_exp "edges-dpr" 100 100 -run_exp "edges-dep-labeling" 200 500 -run_exp "edges-ner-conll2003" 200 250 - -# OntoNotes -run_exp "edges-srl-conll2012" 200 1000 -run_exp "edges-coref-ontonotes-conll" 200 1000 -run_exp "edges-ner-ontonotes" 200 1000 -run_exp "edges-constituent-ontonotes" 200 1000 - -# run_exp "edges-srl-conll2005" -# run_exp "edges-coref-ontonotes" -# run_exp "edges-constituent-ptb" -# run_exp "edges-ccg-tag" diff --git a/scripts/edgeprobing/train_full_baselines.sh b/scripts/edgeprobing/train_full_baselines.sh deleted file mode 100755 index 726419414..000000000 --- a/scripts/edgeprobing/train_full_baselines.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# Script to run edge probing on ELMo char-CNN only. -# Run as separate experiments, since datasets are disjoint anyway. - -NOTIFY_EMAIL=$1 - -function run_exp() { - OVERRIDES="exp_name=train-full-$1, run_name=run" - OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3, elmo_chars_only=0" - python main.py --config_file config/edgeprobe/edgeprobe_train.conf \ - -o "${OVERRIDES}" \ - --remote_log --notify "$NOTIFY_EMAIL" -} - -set -eux - -cd $(dirname $0) -pushd "${PWD%jiant*}/jiant" - -# Small tasks -run_exp "edges-spr2" 100 100 -run_exp "edges-dpr" 100 100 -run_exp "edges-dep-labeling" 200 500 -run_exp "edges-ner-conll2003" 200 250 - -# OntoNotes -run_exp "edges-srl-conll2012" 200 1000 -run_exp "edges-coref-ontonotes-conll" 200 1000 -run_exp "edges-ner-ontonotes" 200 1000 -run_exp "edges-constituent-ontonotes" 200 1000 - -# run_exp "edges-srl-conll2005" -# run_exp "edges-coref-ontonotes" -# run_exp "edges-constituent-ptb" -# run_exp "edges-ccg-tag"