Mes/longformer on beaker copy all #250

Open
wants to merge 67 commits into base: master
Changes from all commits (67 commits)
95296ad  pretraining script (ibeltagy, Jul 16, 2020)
325693e  wip (ibeltagy, Jul 16, 2020)
985acc9  wip (ibeltagy, Jul 17, 2020)
023dd78  wip (ibeltagy, Jul 17, 2020)
08230ac  wip (ibeltagy, Jul 17, 2020)
fb65d57  . (ibeltagy, Jul 17, 2020)
0e80cde  pad chunks or start next doc (ibeltagy, Jul 17, 2020)
6ca7d1b  todo (ibeltagy, Jul 17, 2020)
a2aa4f7  wip (ibeltagy, Jul 17, 2020)
62a69d5  wip (ibeltagy, Jul 17, 2020)
3e3a478  wip (ibeltagy, Jul 18, 2020)
3bc5354  wip (ibeltagy, Jul 18, 2020)
1a91024  wip (ibeltagy, Jul 18, 2020)
5fa21f2  wip (ibeltagy, Jul 18, 2020)
18eb003  wip (ibeltagy, Jul 18, 2020)
607e446  wip (ibeltagy, Jul 18, 2020)
d4659de  wip (ibeltagy, Jul 18, 2020)
c7c53cb  wip (ibeltagy, Jul 19, 2020)
0a07daf  wip (ibeltagy, Jul 22, 2020)
827576c  wip (ibeltagy, Jul 22, 2020)
1a6498c  tpu (ibeltagy, Jul 22, 2020)
3e82548  wip (ibeltagy, Jul 22, 2020)
adadd42  wip (ibeltagy, Jul 23, 2020)
9e191a0  pretraining script (ibeltagy, Jul 16, 2020)
9d18808  wip (ibeltagy, Jul 16, 2020)
6e24cee  wip (ibeltagy, Jul 17, 2020)
a2ab9b3  wip (ibeltagy, Jul 17, 2020)
e3f4ba9  wip (ibeltagy, Jul 17, 2020)
f9e654b  . (ibeltagy, Jul 17, 2020)
9c2646d  pad chunks or start next doc (ibeltagy, Jul 17, 2020)
433a2e2  todo (ibeltagy, Jul 17, 2020)
ec47270  wip (ibeltagy, Jul 17, 2020)
77e105d  wip (ibeltagy, Jul 17, 2020)
af08b5a  wip (ibeltagy, Jul 18, 2020)
d105023  wip (ibeltagy, Jul 18, 2020)
1183999  wip (ibeltagy, Jul 18, 2020)
20e8208  wip (ibeltagy, Jul 18, 2020)
224824d  wip (ibeltagy, Jul 18, 2020)
4a12730  wip (ibeltagy, Jul 18, 2020)
c936d24  wip (ibeltagy, Jul 18, 2020)
510801b  wip (ibeltagy, Jul 19, 2020)
9184b71  wip (ibeltagy, Jul 22, 2020)
4ae991a  wip (ibeltagy, Jul 22, 2020)
aea2a98  tpu (ibeltagy, Jul 22, 2020)
69b717a  wip (ibeltagy, Jul 22, 2020)
5f641c0  wip (ibeltagy, Jul 23, 2020)
e3ddeca  wip (ibeltagy, Jul 23, 2020)
21c9e57  Merge branch 'trainer' of github.com:allenai/longformer into trainer (ibeltagy, Jul 23, 2020)
00ce1e9  wip (ibeltagy, Jul 23, 2020)
56b9c6a  wip (ibeltagy, Jul 23, 2020)
8fca187  wip (ibeltagy, Jul 25, 2020)
9dd76b7  wip (ibeltagy, Jul 25, 2020)
d40983a  wip (ibeltagy, Jul 25, 2020)
f0f6a30  wip (ibeltagy, Jul 25, 2020)
a6e37df  Merge branch 'trainer' of github.com:allenai/longformer into trainer (ibeltagy, Jul 25, 2020)
9eb6fdf  wip (ibeltagy, Jul 25, 2020)
14b6074  wip (ibeltagy, Jul 25, 2020)
5b97bd6  wip (ibeltagy, Jul 25, 2020)
71d7a9d  wip (ibeltagy, Jul 25, 2020)
97a126d  wip (ibeltagy, Jul 25, 2020)
c873da2  wip (ibeltagy, Jul 25, 2020)
d602869  faster gradnorm (ibeltagy, Jul 28, 2020)
ffd06dd  allow changing seqlen at runtime (ibeltagy, Jul 28, 2020)
129a3f9  log and resume data preprocessing (ibeltagy, Jul 30, 2020)
1c42f96  multiprocessed preprocessing (ibeltagy, Jul 30, 2020)
c20264e  wip (ibeltagy, Aug 3, 2020)
ff96351  Save this directory as a dataset and use it directly on a plain base … (meslater1030, Aug 3, 2020)
18 changes: 18 additions & 0 deletions experiment.yml
@@ -0,0 +1,18 @@
tasks:
- cluster: {{.Env.CLUSTER}}
spec:
# This is a python3.7/nvidia base image with basic libraries
image: im_j69gti4atcw9
resultPath: {{.Env.RESULT_PATH}}
args:
- /bin/bash
- -c
- "cd /longformer_on_beaker && pip install . && {{.Env.ARGS}}"
datasetMounts:
- datasetId: {{.Env.INPUT_DATASET_ID}}
containerPath: /data
- datasetId: {{.Env.SCRIPTS}}
containerPath: /longformer_on_beaker
requirements:
gpuCount: {{.Env.GPU_COUNT}}
cpu: {{.Env.CPU_COUNT}}
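Beaker resolves each {{.Env.NAME}} placeholder in the spec above from the environment of the shell that submits it, so every referenced variable must be exported before `beaker experiment create` runs. A minimal sketch of the required bindings (the cluster name and the SCRIPTS dataset id here are illustrative placeholders; INPUT_DATASET_ID reuses the default id that appears in the wrapper script):

```shell
# All {{.Env.*}} names used by experiment.yml must be bound before submission.
export CLUSTER="ai2/example-cluster"          # placeholder cluster name
export RESULT_PATH="/runs/test"               # where Beaker collects results
export ARGS="python scripts/pretrain.py --input_dir /data --save_prefix test"
export INPUT_DATASET_ID="ds_6r0phxc5fiap"     # default input dataset id
export SCRIPTS="ds_000000000000"              # placeholder: dataset holding this repo
export GPU_COUNT=1
export CPU_COUNT=6
# Submission itself needs Beaker access, so it is left commented out here:
# beaker experiment create -f experiment.yml
```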
51 changes: 51 additions & 0 deletions longformer_on_beaker.sh
@@ -0,0 +1,51 @@
#!/bin/bash

export SCRIPTS=$(beaker dataset create -q .)
export INPUT_DATASET_ID="ds_6r0phxc5fiap"
export RESULT_SAVE_DIR="/runs"
export RESULT_SAVE_PREFIX="test"
export ARGS=""
export GPU_COUNT=1
export CPU_COUNT=6
copy=("$@")
for i in "${!copy[@]}"
do
if [[ "${copy[$i]}" = "--save_dir" ]]
then
export RESULT_SAVE_DIR="${copy[$i+1]}"
fi

if [[ "${copy[$i]}" = "--input_dir" ]]
then
export INPUT_DATASET_ID=$(beaker dataset create -q "${copy[$i+1]}")
copy[$i+1]="/data"
fi

if [[ "${copy[$i]}" = "--save_prefix" ]]
then
export RESULT_SAVE_PREFIX="${copy[$i+1]}"
fi

if [[ "${copy[$i]}" = "--num_workers" ]]
then
export CPU_COUNT="${copy[$i+1]}"
fi

if [[ "${copy[$i]}" = "--gpu_count" ]]
then
export GPU_COUNT="${copy[$i+1]}"
fi
ARGS="$ARGS ${copy[$i]}"
done

# If an input dataset was not specified, use the default
if [[ "$INPUT_DATASET_ID" = "ds_6r0phxc5fiap" ]]
then
ARGS="$ARGS --input_dir /data"
fi

echo "$ARGS"

export RESULT_PATH=$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX

beaker experiment create -f experiment.yml
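The flag scan in the wrapper can be exercised without a Beaker installation. This condensed sketch replays the same loop on a stand-in argument list, with the `beaker` calls stubbed out, to show how flags like `--save_prefix` and `--gpu_count` feed both the exported variables and the forwarded ARGS string:

```shell
# Stand-in argument list (pure bash; no beaker calls).
set -- python scripts/pretrain.py --save_prefix demo --gpu_count 2 --num_workers 4
RESULT_SAVE_DIR="/runs"; RESULT_SAVE_PREFIX="test"
GPU_COUNT=1; CPU_COUNT=6; ARGS=""
copy=("$@")
for i in "${!copy[@]}"; do
    case "${copy[$i]}" in
        --save_dir)    RESULT_SAVE_DIR="${copy[$i+1]}" ;;
        --save_prefix) RESULT_SAVE_PREFIX="${copy[$i+1]}" ;;
        --num_workers) CPU_COUNT="${copy[$i+1]}" ;;
        --gpu_count)   GPU_COUNT="${copy[$i+1]}" ;;
    esac
    ARGS="$ARGS ${copy[$i]}"    # every token is forwarded verbatim
done
RESULT_PATH="$RESULT_SAVE_DIR/$RESULT_SAVE_PREFIX"
echo "$RESULT_PATH"   # /runs/demo
```

In the real script, `--input_dir` additionally triggers a `beaker dataset create` upload, which is why that loop rewrites the value to the in-container path /data before forwarding it.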
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,5 +1,6 @@
-torch>=1.2.0
-transformers>=3.0.2
-pytorch-lightning @ git+http://github.com/ibeltagy/pytorch-lightning.git@v0.8.5_fixes#egg=pytorch-lightning
+torch==1.3.1
+transformers==3.0.2
 tensorboardX
+pytorch-lightning==0.6.0
+test-tube==0.7.5
15 changes: 15 additions & 0 deletions scripts/cheatsheet.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,18 @@ python -m scripts.triviaqa_utils.evaluation_utils \
--prediction_file predictions.json
# Output should be:
{'exact_match': 73.07644188665083, 'f1': 77.78523804802242, 'common': 7993, 'denominator': 7993, 'pred_len': 7993, 'gold_len': 7993}


# TPU
import torch_xla.debug.metrics as met; print(met.metrics_report())
curl -X POST http://10.125.212.42:8475/requestversion/pytorch-dev20200722

/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/test_tpu.py

/usr/share/torch-xla-nightly/pytorch/xla/scripts/debug_run.py --outfile debug.tar.gz -- python -u scripts/pretrain.py --input_dir data/ --save_prefix test_xla_2 --gpu_count 0 --tpu_core_count 1 --val_batches 4 --val_every 130 --num_workers 0 --log_rate 1 --model allenai/longformer-base-4096

python scripts/pretrain.py --input_dir data/ --save_prefix test_grad_accum --gpu_count 0 --tpu_core_count 8 --val_batches 30 --val_every 30 --num_workers 0 --log_rate 1

export TPU_IP_ADDRESS=10.125.212.42
export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"
source /anaconda3/bin/activate torch-xla-nightly
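The XRT_TPU_CONFIG export above follows the XRT worker-spec shape <job>;<task index>;<host>:<port>, with 8470 as the conventional XRT port; composing it from the TPU IP can be sketched as:

```shell
TPU_IP_ADDRESS="10.125.212.42"                       # TPU worker IP from the cheatsheet
XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"   # job=tpu_worker, task index 0
echo "$XRT_TPU_CONFIG"   # tpu_worker;0;10.125.212.42:8470
```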