-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgcloud-train.sh
executable file
·139 lines (122 loc) · 3.24 KB
/
gcloud-train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
# Default TensorFlow runtime version; replaced when --runtime is given.
RUNTIME_VERSION='1.13'
# Name the script was invoked as; interpolated into the usage text.
program_name="$0"
#######################################
# Print the command-line help text.
# Globals:   program_name (read), RUNTIME_VERSION (read)
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # Header lines are emitted verbatim (no backslash-escape processing).
  printf '%s\n' \
    "Train and test model specified by a given configuration file in Google Cloud ML Engine" \
    "" \
    "$program_name --config path/to/config.json --train path/to/train.txt --valid path/to/valid.txt --test path/to/test.txt --bucket bucket_name"
  # Option lines go through %b so that the \t escapes are rendered as tabs.
  printf '%b\n' \
    "\t-h --help" \
    "\t--config\tPath to .json file used to configure features and model hyper-parameters" \
    "\t--train\t\tPath to training corpus file" \
    "\t--valid\t\tPath to validation corpus file" \
    "\t--test\t\tComma-separated list of paths to test files (optional)" \
    "\t--bucket\tGoogle Cloud Storage bucket name" \
    "\t--job-name\tJob name (optional)" \
    "\t--runtime\tTensorflow runtime version (optional, ${RUNTIME_VERSION} by default)"
}
#######################################
# Parse command-line options into the global variables used by the rest of
# the script.
# Globals (written): config, train_file, valid_file,
#                    comma_separated_test_files, base_job_name, bucket_name,
#                    RUNTIME_VERSION, debug
# Arguments: the script's command-line arguments ("$@")
# Exits:     after printing usage for -h/--help; with status 1 on an unknown
#            option or an option that is missing its value
#######################################
function parse_args()
{
  local key
  # Initialise once, before the loop: resetting it on every iteration would
  # discard --debug as soon as any later option is processed.
  debug=''
  while [[ $# -gt 0 ]]
  do
    key="$1"
    case ${key} in
      -h|--help)
        usage
        exit
        ;;
      --debug)
        # A flag without a value: consume only the flag itself so that the
        # option following it is not swallowed.
        debug='--debug'
        shift
        ;;
      -c|--config|-t|--train|-d|-v|--valid|--dev|--test|--job-name|--name|--tag|--bucket|--runtime)
        # All of these options take exactly one value.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for option: ${key}" >&2
          usage
          exit 1
        fi
        case ${key} in
          -c|--config) config=$2 ;;
          -t|--train) train_file=$2 ;;
          -d|-v|--valid|--dev) valid_file=$2 ;;
          --test) comma_separated_test_files=$2 ;;
          --job-name|--name|--tag) base_job_name=$2 ;;
          --bucket) bucket_name=$2 ;;
          --runtime) RUNTIME_VERSION=$2 ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Abort with the usage text unless every required option was supplied.
# The config check must expand the $config variable set by -c/--config; a
# bare literal string is never empty, so it would let a missing --config
# slip through.
if [[ -z "$config" || -z "$train_file" || -z "$valid_file" || -z "$bucket_name" ]]; then
  usage
  # Missing required arguments is an error: report it through the exit status.
  exit 1
fi
#######################################
# Derive the default job name from a configuration file path: the file's
# base name with a trailing ".json" removed.
# Arguments: $1 - path to the configuration file
# Outputs:   the derived name on stdout
#######################################
function default_job_name()
{
  # Quoted so that paths containing whitespace or glob characters are passed
  # to basename as a single operand.
  basename -- "$1" .json
}

# Timestamp (YYYYmmdd_HHMMSS) appended to the job name.
now=$(date +"%Y%m%d_%H%M%S")
if [[ -z "$base_job_name" ]]; then
  base_job_name=$(default_job_name "${config}")
  echo "Using default job name (${base_job_name}) since none was provided (use --job-name to specify one)"
fi
job_name="${base_job_name}_${now}"
# Everything belonging to this job is stored under this bucket location.
job_dir="gs://${bucket_name}/experiments/${job_name}"
echo "Setting output directory to $job_dir"
# Build the trainer package and copy it, the resolved configuration and the
# training/validation corpora to ${job_dir}. Every step is checked: if a
# build or upload fails, the script stops instead of submitting a job whose
# inputs are missing.

#######################################
# Run a command; if it fails, report it on stderr and exit with its status.
# Arguments: the command and its arguments
#######################################
function run_or_die()
{
  "$@" && return 0
  local status=$?
  echo "Command failed (exit ${status}): $*" >&2
  exit "${status}"
}

run_or_die python setup.py sdist
# NOTE(review): presumably dist/tfnlp-1.0.tar.gz is the archive produced by
# the sdist step above; confirm the name/version against setup.py.
run_or_die gsutil cp dist/tfnlp-1.0.tar.gz "${job_dir}/app.tar.gz"

# resolve any local configuration references by building the config locally first
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
# Use an unpredictable scratch directory rather than a fixed name under /tmp.
build_dir=$(mktemp -d "${TMPDIR:-/tmp}/${job_name}.XXXXXX") || {
  echo "Unable to create a temporary directory" >&2
  exit 1
}
# Remove the scratch directory (and the built config inside it) on any exit.
trap 'rm -rf -- "${build_dir}"' EXIT
run_or_die python tfnlp/config_builder.py --base "${config}" --output "${build_dir}/config.json"
# From here on, config refers to the locally built configuration file.
config="${build_dir}/config.json"
run_or_die gsutil cp "${config}" "${job_dir}/config.json"

run_or_die gsutil cp "${train_file}" "${job_dir}/train.txt"
run_or_die gsutil cp "${valid_file}" "${job_dir}/valid.txt"
#######################################
# Copy each local test file into a destination directory and set the global
# test_arg_str to "--test <comma-separated destination paths>", or to the
# empty string when the list contains no files.
# Files keep only their base name at the destination, so two test files with
# the same base name would overwrite each other.
# Globals:   test_arg_str (written)
# Arguments: $1 - comma-separated list of local test file paths
#            $2 - destination directory (e.g. the job's gs:// directory)
# Exits:     with status 1 if a file cannot be copied
#######################################
function upload_test_files()
{
  local dest_dir=$2
  local cloud_list=''
  local local_test_file cloud_test_file
  # Deliberately unquoted: commas are turned into spaces and the result is
  # word-split into the individual paths.
  # shellcheck disable=SC2086
  for local_test_file in ${1//,/ }
  do
    cloud_test_file="${dest_dir}/${local_test_file##*/}"
    if ! gsutil cp "${local_test_file}" "${cloud_test_file}"; then
      echo "Failed to upload test file: ${local_test_file}" >&2
      exit 1
    fi
    # Insert the separator only between entries, so the list never starts
    # with a comma.
    cloud_list="${cloud_list:+${cloud_list},}${cloud_test_file}"
  done
  test_arg_str=''
  if [[ -n "${cloud_list}" ]]; then
    test_arg_str="--test ${cloud_list}"
  fi
}

test_arg_str=""
if [[ -n "$comma_separated_test_files" ]]; then
  upload_test_files "${comma_separated_test_files}" "${job_dir}"
fi
#######################################
# Submit the training job with gcloud and stream its logs.
# Globals (read): job_name, job_dir, bucket_name, RUNTIME_VERSION,
#                 test_arg_str, debug
# Arguments: none
# Returns:   the exit status of gcloud
#######################################
function submit_training_job()
{
  # Arguments placed after "--" on the gcloud command line (the arguments
  # for the tfnlp.trainer module), kept in the original order.
  local -a trainer_args=(
    --job-dir "${job_dir}"
    --train "${job_dir}/train.txt"
    --valid "${job_dir}/valid.txt"
  )
  # test_arg_str is either empty or "--test <list>"; it is deliberately left
  # unquoted so that it contributes zero or two separate words.
  # shellcheck disable=SC2206
  trainer_args+=(${test_arg_str})
  trainer_args+=(--mode train --config "${job_dir}/config.json")
  # Only pass the debug flag when it was requested (never an empty argument).
  if [[ -n "${debug}" ]]; then
    trainer_args+=("${debug}")
  fi
  trainer_args+=(--resources "gs://${bucket_name}/resources/")

  gcloud ai-platform jobs submit training "${job_name}" \
    --packages "${job_dir}/app.tar.gz" \
    --config config.yaml \
    --runtime-version "${RUNTIME_VERSION}" \
    --module-name tfnlp.trainer \
    --region us-east1 \
    --stream-logs \
    -- \
    "${trainer_args[@]}"
}

submit_training_job