Skip to content

Commit

Permalink
monitor it and analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
Lia committed May 23, 2018
1 parent 6c73d88 commit 7bfe9b4
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 18 deletions.
288 changes: 288 additions & 0 deletions analysis/test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"from pandas.io.json import json_normalize"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"FILE = '../logs/test.json'"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"with open(FILE) as f:\n",
" d = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"logs = json_normalize(d)\n",
"logs['start_time'] = pd.to_datetime(logs['start_time'])\n",
"logs['end_time'] = pd.to_datetime(logs['end_time'])\n",
"del logs['losses_val']"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>accuracy</th>\n",
" <th>accuracy_-1</th>\n",
" <th>accuracy_1</th>\n",
" <th>end_time</th>\n",
" <th>n_workers</th>\n",
" <th>running_mode</th>\n",
" <th>running_time</th>\n",
" <th>start_time</th>\n",
" <th>tag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.795592</td>\n",
" <td>0.991687</td>\n",
" <td>0.583258</td>\n",
" <td>2018-05-22 15:04:55</td>\n",
" <td>2</td>\n",
" <td>synchronous</td>\n",
" <td>26.858709</td>\n",
" <td>2018-05-22 15:04:28</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.687986</td>\n",
" <td>1.000000</td>\n",
" <td>0.316935</td>\n",
" <td>2018-05-22 15:35:06</td>\n",
" <td>2</td>\n",
" <td>synchronous</td>\n",
" <td>22.663874</td>\n",
" <td>2018-05-22 15:34:43</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.734226</td>\n",
" <td>1.000000</td>\n",
" <td>0.414286</td>\n",
" <td>2018-05-22 15:38:23</td>\n",
" <td>2</td>\n",
" <td>synchronous</td>\n",
" <td>23.167867</td>\n",
" <td>2018-05-22 15:38:00</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.796024</td>\n",
" <td>0.990291</td>\n",
" <td>0.573284</td>\n",
" <td>2018-05-22 15:42:08</td>\n",
" <td>2</td>\n",
" <td>synchronous</td>\n",
" <td>24.359314</td>\n",
" <td>2018-05-22 15:41:44</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.884183</td>\n",
" <td>0.971797</td>\n",
" <td>0.782852</td>\n",
" <td>2018-05-22 15:46:49</td>\n",
" <td>2</td>\n",
" <td>synchronous</td>\n",
" <td>24.285353</td>\n",
" <td>2018-05-22 15:46:24</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" accuracy accuracy_-1 accuracy_1 end_time n_workers \\\n",
"0 0.795592 0.991687 0.583258 2018-05-22 15:04:55 2 \n",
"1 0.687986 1.000000 0.316935 2018-05-22 15:35:06 2 \n",
"2 0.734226 1.000000 0.414286 2018-05-22 15:38:23 2 \n",
"3 0.796024 0.990291 0.573284 2018-05-22 15:42:08 2 \n",
"4 0.884183 0.971797 0.782852 2018-05-22 15:46:49 2 \n",
"\n",
" running_mode running_time start_time tag \n",
"0 synchronous 26.858709 2018-05-22 15:04:28 \n",
"1 synchronous 22.663874 2018-05-22 15:34:43 \n",
"2 synchronous 23.167867 2018-05-22 15:38:00 \n",
"3 synchronous 24.359314 2018-05-22 15:41:44 \n",
"4 synchronous 24.285353 2018-05-22 15:46:24 "
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logs.head()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"times = [pd.DataFrame(d[i]['losses_val']) for i in range(len(d))]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>loss_val</th>\n",
" <th>time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2270.469972</td>\n",
" <td>2018-05-22 15:04:28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2225.287023</td>\n",
" <td>2018-05-22 15:04:29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2185.046260</td>\n",
" <td>2018-05-22 15:04:29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2174.906931</td>\n",
" <td>2018-05-22 15:04:29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2166.709046</td>\n",
" <td>2018-05-22 15:04:30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" loss_val time\n",
"0 2270.469972 2018-05-22 15:04:28\n",
"1 2225.287023 2018-05-22 15:04:29\n",
"2 2185.046260 2018-05-22 15:04:29\n",
"3 2174.906931 2018-05-22 15:04:29\n",
"4 2166.709046 2018-05-22 15:04:30"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"times[0].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 8 additions & 0 deletions monitor-it.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
#
# Watchdog companion for run-in-cluster.sh.
#
# Polls `kubectl get pods` every 5 seconds. Once the 5th column (AGE in the
# default `kubectl get pods` table) of the coordinator pod reads "3m", it
# force-kills the first process listed by `ps -a` whose line matches
# "run-in-cluster".

# Print the 5th column of the first `kubectl get pods` row matching
# "coordinator". Prints nothing when no such pod is listed yet.
coordinator_age() {
    kubectl get pods | grep coordinator | awk '{print $5}' | head -n 1
}

# NOTE(review): the original condition was
#     [ $(kubectl get pods | grep coordinator | awk '{print $5}' = "3m") ]
# The `= "3m"` was inside the command substitution, so awk received `=` and
# `3m` as file operands, nothing was compared, and the loop ended on the first
# pass. The comparison now happens inside `[ ]`, with the substitution quoted so
# an empty result (pod not created yet) is still a valid operand.
# The loop waits *until* the age reads "3m"; this assumes the script is meant
# to act as a 3-minute timeout for the job -- confirm the intended direction.
while [ "$(coordinator_age)" != "3m" ]
do
    sleep 5
done

# Only signal when a PID was actually found: `kill -9` with no operand just
# prints a usage error.
target_pid=$(ps -a | grep run-in-cluster | awk '{print $1}' | head -n 1)
if [ -n "${target_pid}" ]; then
    kill -9 "${target_pid}"
fi
40 changes: 29 additions & 11 deletions run-in-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ DATA_PATH=/data/datasets

while getopts ":n:r:f:" opt; do
case $opt in
n) N_WORKERS="$OPTARG";;
r) RUNNING_MODE="$OPTARG";;
f) FILE_LOG="$OPTARG";;
n) N_WORKERS="$OPTARG";; # number workers
r) RUNNING_MODE="$OPTARG";; # synchronous or asynchronous
f) FILE_LOG="$OPTARG";; # file where the output of the job will be stored
\?) echo "Invalid option -$OPTARG" >&2
;;
esac
Expand All @@ -34,7 +34,8 @@ function shutdown_infra {
fi;
}

# Don't forget to run login first with `docker login`
echo
echo "----- Logging in Docker Hub -----"
docker login --username=$DOCKER_USER --password=$DOCKER_PASS 2> /dev/null

echo
Expand All @@ -46,6 +47,11 @@ echo "----- Building and Pushing docker to Docker Hub -----"
docker build -f `pwd`/Docker/Dockerfile `pwd` -t ${REPO}/${APP_NAME}
docker push ${REPO}/${APP_NAME}

echo
echo
echo "----- Starting Monitoring -----"
bash monitor-it.sh &

echo
echo "----- Starting workers -----"
kubectl create configmap hogwild-config --from-literal=replicas=${N_WORKERS} \
Expand All @@ -59,7 +65,7 @@ do
sleep 1
done


echo
echo
echo "----- Workers are up and running, starting coordinator -----"
kubectl create -f Kubernetes/coordinator.yaml
Expand All @@ -69,6 +75,7 @@ while [ $(kubectl get pods | grep coordinator | grep Running | wc -l) == 0 ]
do
sleep 1
done

echo
echo "----- Running Job -----"

Expand All @@ -88,17 +95,24 @@ done


echo
echo "----- Job Completed, logs available in logs/log_${MY_TIME}.json -----"
echo "----- Job Completed, writing log -----"


if [[ -z $(ls logs | grep ${FILE_LOG}) ]];
then
touch ${FILE_LOG}
fi;
then
touch logs/${FILE_LOG}
fi;

jq -s add logs/${FILE_LOG} logs/log_${MY_TIME}
echo $(jq -s add logs/${FILE_LOG} logs/log_${MY_TIME}.json) > logs/${FILE_LOG}
rm logs/log_${MY_TIME}.json

echo
echo "----- Logs available in ${FILE_LOG} -----"

echo
echo "----- Shutting down monitoring -----"
kill -9 $(ps -a | grep monitor-it | awk '{print $1}' | head -n 1)

echo
echo "----- Shutting down infra -----"
shutdown_infra
Expand All @@ -113,4 +127,8 @@ shutdown_infra
# kubectl logs $APP_NAME -p --container="coordinator"
#kubectl -n my-ns delete po,svc --all
#kubectl delete -f Kubernetes/workers_template.yaml --cascade=true
#kubectl exec -it coordinator-0 -- /bin/bash
#kubectl exec -it coordinator-0 -- /bin/bash


# ➝ kubectl scale --replicas=3 service/workers-service
#error: Scaling the resource failed with: could not fetch the scale for services workers-service: services "workers-service" is forbidden: User "cs449g9" cannot get services/scale in the namespace "cs449g9"
Loading

0 comments on commit 7bfe9b4

Please sign in to comment.