Skip to content

Commit

Permalink
Update edge probing runner and tokenization scripts (#858)
Browse files Browse the repository at this point in the history
* Update NFS paths

* Remove unused experiment scripts

* Fix edge probing configs

* Add username to Kubernetes jobs

This plays nicer in multi-user environments: everything is now written to
/nfs/jiant/exp/$USER/$PROJECT_NAME, and job names are prefixed with $USER.

* Move args to top of retokenize script for better documentation

* Update retokenize_bert.sh

* Update edge probing experiment script

* Add username to job name and project dir for Kubernetes runs

Better default behavior on a multi-user cluster.

* Formatter
  • Loading branch information
iftenney committed Jul 22, 2019
1 parent 5804e55 commit 7ff148e
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 211 deletions.
2 changes: 0 additions & 2 deletions config/edgeprobe/edgeprobe_bare.conf
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ lr_patience = 5 // vals until LR decay
patience = 20 // vals until early-stopping

tokenizer = "" // use native tokenization with ELMo
elmo = 1
elmo_chars_only = 1 // set to 0 to use full ELMo

// Use no-op encoder (no params).
sent_enc = "none"
Expand Down
35 changes: 20 additions & 15 deletions probing/retokenize_bert.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,39 @@

set -eu

EDGE_DATA_PATH="/nfs/jsalt/share/glue_data/edges"
EDGE_DATA_PATH="$JIANT_DATA_DIR/edges"
echo "Processing edge probing data in $EDGE_DATA_PATH"

declare -a SUBPATHS
SUBPATHS+=( "ontonotes-constituents" )
SUBPATHS+=( "dep_ewt" )
SUBPATHS+=( "ontonotes-ner" )
SUBPATHS+=( "srl_conll2012" )
SUBPATHS+=( "ontonotes-coref-conll" )
SUBPATHS+=( "spr1" )
SUBPATHS+=( "spr2" )
SUBPATHS+=( "dpr" )
SUBPATHS+=( "dep_ewt" )
SUBPATHS+=( "ontonotes/const/pos" )
SUBPATHS+=( "ontonotes/const/nonterminal" )
SUBPATHS+=( "ontonotes/srl" )
SUBPATHS+=( "ontonotes/ner" )
SUBPATHS+=( "ontonotes/coref" )
SUBPATHS+=( "semeval" )
SUBPATHS+=( "tacred/rel" )
SUBPATHS+=( "noun_verb" )

for subpath in "${SUBPATHS[@]}"; do
python $(dirname $0)/retokenize_edge_data.bert.py \
--model bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.bert.py \
--model bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.py \
-t bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.py \
-t bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json &
done

# exit 0

# Only use the cased model on NER, per https://arxiv.org/pdf/1810.04805.pdf
CASED_SUBPATHS=( "ontonotes-ner" )
CASED_SUBPATHS=( "ontonotes/ner" )

for subpath in "${CASED_SUBPATHS[@]}"; do
python $(dirname $0)/retokenize_edge_data.bert.py \
--model bert-base-cased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.bert.py \
--model bert-large-cased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.py \
-t bert-base-cased $EDGE_DATA_PATH/$subpath/*.json &
python $(dirname $0)/retokenize_edge_data.py \
-t bert-large-cased $EDGE_DATA_PATH/$subpath/*.json &
done

16 changes: 7 additions & 9 deletions probing/retokenize_edge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@

log.basicConfig(format="%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p", level=log.INFO)

PARSER = argparse.ArgumentParser()
PARSER.add_argument("-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name.")
PARSER.add_argument(
"--num_parallel", type=int, default=4, help="Number of parallel processes to use."
)
PARSER.add_argument("inputs", type=str, nargs="+", help="Input JSON files.")

# For now, this module expects MosesTokenizer as the default.
# TODO: change this once we have better support in core utils.
Expand Down Expand Up @@ -81,15 +87,7 @@ def retokenize_file(fname, tokenizer_name, worker_pool):


def main(args):
parser = argparse.ArgumentParser()
parser.add_argument(
"-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name."
)
parser.add_argument(
"--num_parallel", type=int, default=4, help="Number of parallel processes to use."
)
parser.add_argument("inputs", type=str, nargs="+", help="Input JSON files.")
args = parser.parse_args(args)
args = PARSER.parse_args(args)

worker_pool = multiprocessing.Pool(args.num_parallel)
for fname in args.inputs:
Expand Down
26 changes: 15 additions & 11 deletions scripts/edgeprobing/exp_fns.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#
# See individual functions below for usage.

EP_RESOURCE_DIR="/nfs/jiant/share/edge-probing/resources"

function run_exp() {
# Helper function to invoke main.py.
# Don't run this directly - use the experiment functions below,
Expand All @@ -34,34 +36,34 @@ function elmo_chars_exp() {
# Lexical baseline, probe ELMo char CNN layer.
# Usage: elmo_chars_exp <task_name>
OVERRIDES="exp_name=elmo-chars-$1, run_name=run"
OVERRIDES+=", target_tasks=$1"
OVERRIDES+=", target_tasks=$1, input_module=elmo-chars-only"
run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
}

function elmo_full_exp() {
# Full ELMo, probe full ELMo with learned mixing weights.
# Usage: elmo_full_exp <task_name>
#   $1 - task name; interpolated into exp_name and target_tasks.
# Builds a comma-separated override string in OVERRIDES and hands it to
# run_exp together with the edgeprobe_bare.conf config path.
# OVERRIDES is assigned without 'local', so it remains set in the calling
# shell after this function returns.
OVERRIDES="exp_name=elmo-full-$1, run_name=run"
# NOTE(review): the two appends below look like the removed and added sides of
# one diff line (elmo_chars_only=0 vs. input_module=elmo). As written, both are
# appended and target_tasks appears twice - confirm which line is intended.
OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
OVERRIDES+=", target_tasks=$1, input_module=elmo"
run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
}

function elmo_ortho_exp() {
# Full ELMo with random orthogonal weights for LSTM and projections.
# Usage: elmo_ortho_exp <task_name> <random_seed>
#   $1 - task name; interpolated into exp_name and target_tasks.
#   $2 - random seed; selects the weights file name and the run_name suffix.
# Builds OVERRIDES (including elmo_weight_file_path) and hands it to run_exp
# together with the edgeprobe_bare.conf config path.
# ELMO_WEIGHTS_PATH and OVERRIDES are assigned without 'local', so they remain
# set in the calling shell after this function returns.
# NOTE(review): ELMO_WEIGHTS_PATH is assigned twice; the second assignment
# (under EP_RESOURCE_DIR) overwrites the hard-coded /nfs/jsalt path. These look
# like the removed and added sides of one diff line - confirm.
ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5"
ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5"
OVERRIDES="exp_name=elmo-ortho-$1, run_name=run_seed_$2"
# NOTE(review): same old/new duplication as above; as written, both appends
# take effect and target_tasks appears twice - confirm which is intended.
OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
OVERRIDES+=", target_tasks=$1, input_module=elmo"
OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}"
run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
}

function elmo_random_exp() {
# Full ELMo with random normal weights for LSTM and projections.
# Usage: elmo_random_exp <task_name> <random_seed>
#   $1 - task name; interpolated into exp_name and target_tasks.
#   $2 - random seed; selects the weights file name and the run_name suffix.
# Builds OVERRIDES (including elmo_weight_file_path) and hands it to run_exp
# together with the edgeprobe_bare.conf config path.
# ELMO_WEIGHTS_PATH and OVERRIDES are assigned without 'local', so they remain
# set in the calling shell after this function returns.
# NOTE(review): ELMO_WEIGHTS_PATH is assigned twice; the second assignment
# (under EP_RESOURCE_DIR) overwrites the hard-coded /nfs/jsalt path. These look
# like the removed and added sides of one diff line - confirm.
ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5"
ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5"
OVERRIDES="exp_name=elmo-random-$1, run_name=run_seed_$2"
# NOTE(review): same old/new duplication as above; as written, both appends
# take effect and target_tasks appears twice - confirm which is intended.
OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
OVERRIDES+=", target_tasks=$1, input_module=elmo"
OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}"
run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
}
Expand All @@ -71,14 +73,16 @@ function train_chars_exp() {
# Usage: train_chars_exp <task_name> <max_vals> <val_interval>
OVERRIDES="exp_name=train-chars-$1, run_name=run"
OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3"
OVERRIDES+=", input_module=elmo-chars-only"
run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}"
}

function train_full_exp() {
# Trained encoder over full ELMo.
# Usage: train_full_exp <task_name> <max_vals> <val_interval>
#   $1 - task name; interpolated into exp_name and pretrain_tasks.
#   $2 - value passed through as the max_vals override.
#   $3 - value passed through as the val_interval override.
# Builds a comma-separated override string in OVERRIDES and hands it to
# run_exp together with the edgeprobe_train.conf config path.
# OVERRIDES is assigned without 'local', so it remains set in the calling
# shell after this function returns.
OVERRIDES="exp_name=train-full-$1, run_name=run"
# NOTE(review): the first append below looks like the removed side of a diff
# line whose replacement is the two appends after it. As written, all three
# are appended, so pretrain_tasks/max_vals/val_interval appear twice and both
# elmo_chars_only=0 and input_module=elmo are set - confirm which is intended.
OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3, elmo_chars_only=0"
OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3"
OVERRIDES+=", input_module=elmo"
run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}"
}

Expand Down Expand Up @@ -140,7 +144,7 @@ function openai_mix_exp() {
function openai_bwb_exp() {
# Probe the OpenAI transformer model, as trained on BWB-shuffled.
# Usage: openai_bwb_exp <task_name>
CKPT_PATH="/nfs/jsalt/home/iftenney/checkpoints/bwb_shuffled/model.ckpt-1000000"
CKPT_PATH="${EP_RESOURCE_DIR}/checkpoints/bwb_shuffled/model.ckpt-1000000"
OVERRIDES="exp_name=openai-bwb-$1, run_name=run"
OVERRIDES+=", target_tasks=$1"
OVERRIDES+=", openai_transformer_ckpt=${CKPT_PATH}"
Expand All @@ -157,7 +161,7 @@ function bert_cat_exp() {
# Usage: bert_cat_exp <task_name>
OVERRIDES="exp_name=bert-${2}-cat-${1}, run_name=run"
OVERRIDES+=", target_tasks=$1"
OVERRIDES+=", bert_model_name=bert-$2"
OVERRIDES+=", input_module=bert-$2"
OVERRIDES+=", bert_embeddings_mode=cat"
run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
}
Expand All @@ -167,7 +171,7 @@ function bert_lex_exp() {
# Usage: bert_lex_exp <task_name>
OVERRIDES="exp_name=bert-${2}-lex-${1}, run_name=run"
OVERRIDES+=", target_tasks=$1"
OVERRIDES+=", bert_model_name=bert-$2"
OVERRIDES+=", input_module=bert-$2"
OVERRIDES+=", bert_embeddings_mode=only"
run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
}
Expand All @@ -177,7 +181,7 @@ function bert_mix_exp() {
# Usage: bert_mix_exp <task_name>
OVERRIDES="exp_name=bert-${2}-mix-${1}, run_name=run"
OVERRIDES+=", target_tasks=$1"
OVERRIDES+=", bert_model_name=bert-$2"
OVERRIDES+=", input_module=bert-$2"
OVERRIDES+=", bert_embeddings_mode=mix"
run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
}
38 changes: 24 additions & 14 deletions scripts/edgeprobing/kubernetes_run_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
set -e

# Default arguments.
PROJECT=""
PROJECT_NAME=""
NOTIFY_EMAIL="$NOTIFY_EMAIL" # allow pre-set from shell

# Handle flags.
OPTIND=1 # Reset in case getopts has been used previously in the shell.
while getopts ":p:n:" opt; do
case "$opt" in
p) PROJECT=$OPTARG
p) PROJECT_NAME=$OPTARG
;;
n) NOTIFY_EMAIL=$OPTARG
;;
Expand All @@ -34,7 +34,7 @@ shift $((OPTIND-1))
# Remaining positional arguments.
MODE=${1:-"create"}

if [ -z $PROJECT ]; then
if [ -z $PROJECT_NAME ]; then
echo "You must provide a project name!"
exit 1
fi
Expand All @@ -47,8 +47,18 @@ fi
# Top-level directory for the current repo.
pushd $(git rev-parse --show-toplevel)

# Get the NFS path from the Kubernetes config, so that it doesn't need to be
# hardcoded here.
pushd gcp/kubernetes/templates
NFS_EXP_DIR=$(jsonnet -S -e "local env = import 'jiant_env.libsonnet'; env.nfs_exp_dir")
echo "Assuming NFS experiment path at $NFS_EXP_DIR"
popd

# Make a copy of the current tree in the project directory.
PROJECT_DIR="/nfs/jsalt/exp/$PROJECT"
PROJECT_DIR="${NFS_EXP_DIR}/${USER}/${PROJECT_NAME}"
if [ ! -d "${NFS_EXP_DIR}/$USER" ]; then
mkdir "${NFS_EXP_DIR}/$USER"
fi
if [ ! -d "${PROJECT_DIR}" ]; then
echo "Creating project directory ${PROJECT_DIR}"
mkdir ${PROJECT_DIR}
Expand All @@ -65,7 +75,7 @@ function make_kubernetes_command() {
# Uses exp_fns.sh to generate configs; see that file for details
# and to define new experiments.
echo -n "pushd ${PATH_TO_JIANT}"
echo -n "; source scripts/edges/exp_fns.sh"
echo -n "; source scripts/edgeprobing/exp_fns.sh"
echo -n "; $@"
}

Expand All @@ -74,8 +84,9 @@ function kuberun() {
NAME=$1
COMMAND=$(make_kubernetes_command $2)
echo "Job '$NAME': '$COMMAND'"
./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT} -g ${GPU_TYPE} \
-n ${NOTIFY_EMAIL} $NAME "$COMMAND"
./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT_NAME} -g ${GPU_TYPE} \
$NAME "$COMMAND"
# -n ${NOTIFY_EMAIL} \ # Temporarily disabled
echo ""
}

Expand Down Expand Up @@ -103,14 +114,14 @@ if [[ $MODE == "delete" ]]; then
fi

##
# Run these on the main 'jsalt' cluster
gcloud container clusters get-credentials --zone us-east1-c jsalt
# Run these on p100s (default)
export GPU_TYPE="p100"
for task in "${ALL_TASKS[@]}"
do
kuberun elmo-chars-$task "elmo_chars_exp edges-$task"
kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0"
kuberun elmo-full-$task "elmo_full_exp edges-$task"
# ELMo is currently broken at master, so skip these.
# kuberun elmo-chars-$task "elmo_chars_exp edges-$task"
# kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0"
# kuberun elmo-full-$task "elmo_full_exp edges-$task"
kuberun glove-$task "glove_exp edges-$task"
kuberun cove-$task "cove_exp edges-$task"

Expand All @@ -120,8 +131,7 @@ do
done

##
# Run these on 'jsalt-central' for V100s
gcloud container clusters get-credentials --zone us-central1-a jsalt-central
# Run the larger experiments (transformers) on v100s
export GPU_TYPE="v100"
for task in "${ALL_TASKS[@]}"
do
Expand Down
44 changes: 0 additions & 44 deletions scripts/edgeprobing/ortho_elmo_baselines.sh

This file was deleted.

44 changes: 0 additions & 44 deletions scripts/edgeprobing/random_elmo_baselines.sh

This file was deleted.

Loading

0 comments on commit 7ff148e

Please sign in to comment.