Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Internal: use native batch container feature #646

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions aztk/client/cluster/helpers/create.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from datetime import timedelta

import azure.batch.models as batch_models

from aztk import models
from aztk.utils import helpers, constants
from aztk.utils import constants, helpers


def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str,
Expand Down Expand Up @@ -36,7 +37,11 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
pool = batch_models.PoolAddParameter(
id=pool_id,
virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use),
image_reference=image_ref_to_use,
node_agent_sku_id=sku_to_use,
container_configuration=batch_models.ContainerConfiguration(
container_image_names=["aztk/spark:v0.1.0-spark2.3.0-base"] # TODO: parametize container
)),
vm_size=cluster_conf.vm_size,
enable_auto_scale=True,
auto_scale_formula=auto_scale_formula,
Expand Down
33 changes: 3 additions & 30 deletions aztk/node_scripts/setup_host.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,63 +11,43 @@ export PYTHONUNBUFFERED=TRUE
container_name=$1
docker_repo_name=$2


# Install host-level prerequisites: Docker CE repo + packages, Python tooling,
# and (optionally) the NVIDIA driver/nvidia-docker stack for GPU nodes.
install_prerequisites () {
echo "Installing pre-reqs"

# Register Docker's official apt repository and its signing key.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"

packages=(
apt-transport-https
curl
ca-certificates
software-properties-common
python3-pip
python3-venv
docker-ce
)

echo "running apt-get install -y --no-install-recommends \"${packages[@]}\""
apt-get -y update &&
apt-get install -y --no-install-recommends "${packages[@]}"

# NOTE(review): $AZTK_GPU_ENABLED is unquoted — if the variable is unset this
# `[` test errors out (harmlessly skipping the branch); confirm callers always export it.
if [ $AZTK_GPU_ENABLED == "true" ]; then
# GPU node: install the NVIDIA driver and nvidia-docker v1 runtime.
apt-get install -y nvidia-384 nvidia-modprobe
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb
fi
echo "Finished installing pre-reqs"
}


# Download the pinned docker-compose release binary for this host's OS/arch
# and install it as an executable in /usr/local/bin.
install_docker_compose () {
    echo "Installing Docker-Compose"
    # -f makes curl fail on HTTP errors (e.g. a 404 from GitHub releases)
    # instead of silently saving the HTML error page as the "binary".
    sudo curl -fL https://github.com/docker/compose/releases/download/1.19.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
    sudo chmod +x /usr/local/bin/docker-compose
    echo "Finished installing Docker-Compose"
}

# Pull the cluster's Docker image ($docker_repo_name, set from script arg $2),
# logging in to the registry first when credentials are provided via
# DOCKER_USERNAME / DOCKER_PASSWORD (and optionally DOCKER_ENDPOINT).
pull_docker_container () {
    echo "Pulling $docker_repo_name"

    if [ -z "$DOCKER_USERNAME" ]; then
        echo "No Credentials provided. No need to login to dockerhub"
    else
        echo "Docker credentials provided. Login in."
        # Feed the password over stdin rather than --password so the secret
        # never appears in the process list (`ps`) or shell history.
        # $DOCKER_ENDPOINT stays unquoted on purpose: when unset it must
        # expand to nothing (default registry), not an empty-string argument.
        echo "$DOCKER_PASSWORD" | docker login $DOCKER_ENDPOINT --username "$DOCKER_USERNAME" --password-stdin
    fi

    # Quote the repo name so spaces/globs in the argument can't word-split.
    docker pull "$docker_repo_name"
    echo "Finished pulling $docker_repo_name"
}

# Create the node's pipenv environment and upgrade its packaging tooling.
install_python_dependencies () {
echo "Installing python dependencies"
# Pin the interpreter so the virtualenv is built on the node's python3.5 build.
pipenv install --python /usr/bin/python3.5m
pipenv run pip install --upgrade setuptools wheel #TODO: add pip when pipenv is compatible with pip10
echo "Finished installing python dependencies"

}


run_docker_container () {
echo "Running docker container"

Expand Down Expand Up @@ -96,13 +76,9 @@ run_docker_container () {
ln -s $docker_log $AZ_BATCH_TASK_WORKING_DIR/logs/docker.log
fi
echo "Finished running docker container"

}





main () {

time(
Expand All @@ -119,9 +95,6 @@ main () {
install_docker_compose
) 2>&1

time(
pull_docker_container
) 2>&1

# Unzip resource files and set permissions
chmod 777 $AZTK_WORKING_DIR/aztk/node_scripts/docker_main.sh
Expand Down
6 changes: 4 additions & 2 deletions aztk/spark/client/base/helpers/generate_cluster_start_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from aztk.spark import models
from aztk.spark.utils import util
from aztk.utils import constants, helpers
from aztk.spark import models

POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
auto_user=batch_models.AutoUserSpecification(
Expand Down Expand Up @@ -89,6 +88,8 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
setup = [
'time('\
'apt-get -y update;'\
'n=0;'\
'until [ $n -ge 15 ]; do apt-get -y --no-install-recommends install unzip && break; n=$[$n+1]; ps faux; sleep 5; echo "waited $n*5 seconds"; done;'\
'apt-get -y --no-install-recommends install unzip;'\
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
Expand Down Expand Up @@ -145,4 +146,5 @@ def generate_cluster_start_task(core_base_operations,
resource_files=resource_files,
environment_settings=environment_settings,
user_identity=POOL_ADMIN_USER_IDENTITY,
wait_for_success=True)
wait_for_success=True,
max_task_retry_count=2)
36 changes: 15 additions & 21 deletions aztk/utils/helpers.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from __future__ import print_function

import datetime
import io
import logging
import os
import time
import re
import azure.common
import azure.batch.batch_service_client as batch
import time

import azure.batch.batch_auth as batch_auth
import azure.batch.batch_service_client as batch
import azure.batch.models as batch_models
import azure.common
import azure.storage.blob as blob
from aztk.version import __version__
from aztk.utils import constants
from aztk import error
import aztk.models
import yaml
import logging

import aztk.models
from aztk import error
from aztk.utils import constants
from aztk.version import __version__

_STANDARD_OUT_FILE_NAME = 'stdout.txt'
_STANDARD_ERROR_FILE_NAME = 'stderr.txt'
Expand Down Expand Up @@ -179,19 +182,10 @@ def select_latest_verified_vm_image_with_node_agent_sku(publisher, offer, sku_st
:rtype: tuple
:return: (node agent sku id to use, vm image ref to use)
"""
# get verified vm image list and node agent sku ids from service
node_agent_skus = batch_client.account.list_node_agent_skus()

# pick the latest supported sku
skus_to_use = [(sku, image_ref)
for sku in node_agent_skus
for image_ref in sorted(sku.verified_image_references, key=lambda item: item.sku)
if image_ref.publisher.lower() == publisher.lower() and image_ref.offer.lower() == offer.lower()
and image_ref.sku.startswith(sku_starts_with)]

# skus are listed in reverse order, pick first for latest
sku_to_use, image_ref_to_use = skus_to_use[0]
return (sku_to_use.id, image_ref_to_use)
image_ref_to_use = batch_models.ImageReference(
publisher='microsoft-azure-batch', offer='ubuntu-server-container', sku='16-04-lts', version='latest')
node_agent_sku_id = 'batch.node.ubuntu 16.04'
return (node_agent_sku_id, image_ref_to_use)


def create_sas_token(container_name, blob_name, permission, blob_client, expiry=None, timeout=None):
Expand Down
5 changes: 2 additions & 3 deletions aztk_cli/spark/endpoints/cluster/cluster_ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@

import aztk
from aztk.models import ClusterConfiguration
from aztk.spark.models import PortForwardingSpecification
from aztk_cli import config, log, utils
from aztk_cli.config import SshConfig

from aztk.spark.models import PortForwardingSpecification


def setup_parser(parser: argparse.ArgumentParser):
parser.add_argument('--id', dest="cluster_id", help='The unique id of your spark cluster')
Expand Down Expand Up @@ -37,7 +36,7 @@ def setup_parser(parser: argparse.ArgumentParser):
def execute(args: typing.NamedTuple):
spark_client = aztk.spark.Client(config.load_aztk_secrets())
cluster = spark_client.cluster.get(args.cluster_id)
cluster_config = spark_client.cluster.get_cluster_config(args.cluster_id)
cluster_config = spark_client.cluster._core_cluster_operations.get_cluster_config(args.cluster_id)
ssh_conf = SshConfig()

ssh_conf.merge(
Expand Down
2 changes: 1 addition & 1 deletion aztk_cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def ssh_in_master(client,

# Get master node id from task (job and task are both named pool_id)
cluster = client.cluster.get(cluster_id)
configuration = client.cluster.get_cluster_config(cluster_id)
configuration = client.cluster._core_cluster_operations.get_cluster_config(cluster_id)

master_node_id = cluster.master_node_id

Expand Down