diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py
index 736c79fa..0f86f982 100644
--- a/aztk/client/cluster/helpers/create.py
+++ b/aztk/client/cluster/helpers/create.py
@@ -1,8 +1,9 @@
 from datetime import timedelta
+
 import azure.batch.models as batch_models
 
 from aztk import models
-from aztk.utils import helpers, constants
+from aztk.utils import constants, helpers
 
 
 def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str,
@@ -36,7 +37,11 @@ def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterCon
     pool = batch_models.PoolAddParameter(
         id=pool_id,
         virtual_machine_configuration=batch_models.VirtualMachineConfiguration(
-            image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use),
+            image_reference=image_ref_to_use,
+            node_agent_sku_id=sku_to_use,
+            container_configuration=batch_models.ContainerConfiguration(
+                container_image_names=["aztk/spark:v0.1.0-spark2.3.0-base"]  # TODO: parameterize container
+            )),
         vm_size=cluster_conf.vm_size,
         enable_auto_scale=True,
         auto_scale_formula=auto_scale_formula,
diff --git a/aztk/node_scripts/setup_host.sh b/aztk/node_scripts/setup_host.sh
index c23ff0dd..b7cf6095 100644
--- a/aztk/node_scripts/setup_host.sh
+++ b/aztk/node_scripts/setup_host.sh
@@ -11,12 +11,10 @@ export PYTHONUNBUFFERED=TRUE
 container_name=$1
 docker_repo_name=$2
 
+
 install_prerequisites () {
     echo "Installing pre-reqs"
 
-    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
-    add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
-
     packages=(
         apt-transport-https
         curl
@@ -24,21 +22,16 @@ install_prerequisites () {
         software-properties-common
         python3-pip
         python3-venv
-        docker-ce
     )
 
     echo "running apt-get install -y --no-install-recommends \"${packages[@]}\""
     apt-get -y update && apt-get install -y --no-install-recommends "${packages[@]}"
 
-    if [ $AZTK_GPU_ENABLED == "true" ]; then
-        apt-get install -y nvidia-384 nvidia-modprobe
-        wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb
-        sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb
-    fi
 
     echo "Finished installing pre-reqs"
 }
 
+
 install_docker_compose () {
     echo "Installing Docker-Compose"
     sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose
@@ -46,28 +39,15 @@ install_docker_compose () {
     echo "Finished installing Docker-Compose"
 }
 
-pull_docker_container () {
-    echo "Pulling $docker_repo_name"
-
-    if [ -z "$DOCKER_USERNAME" ]; then
-        echo "No Credentials provided. No need to login to dockerhub"
-    else
-        echo "Docker credentials provided. Login in."
-        docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
-    fi
-
-    docker pull $docker_repo_name
-    echo "Finished pulling $docker_repo_name"
-}
 
 
 install_python_dependencies () {
     echo "Installing python dependencies"
     pipenv install --python /usr/bin/python3.5m
     pipenv run pip install --upgrade setuptools wheel #TODO: add pip when pipenv is compatible with pip10
     echo "Finished installing python dependencies"
-
 }
 
+
 run_docker_container () {
     echo "Running docker container"
@@ -96,13 +76,9 @@ run_docker_container () {
         ln -s $docker_log $AZ_BATCH_TASK_WORKING_DIR/logs/docker.log
     fi
     echo "Finished running docker container"
-
 }
 
-
-
-
 
 main () {
 
     time(
@@ -119,9 +95,6 @@ main () {
         install_docker_compose
     ) 2>&1
 
-    time(
-        pull_docker_container
-    ) 2>&1
 
     # Unzip resource files and set permissions
     chmod 777 $AZTK_WORKING_DIR/aztk/node_scripts/docker_main.sh
diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py
index 0b100bf1..a44e3733 100644
--- a/aztk/spark/client/base/helpers/generate_cluster_start_task.py
+++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py
@@ -8,7 +8,6 @@ from aztk.spark import models
 from aztk.spark.utils import util
 from aztk.utils import constants, helpers
-from aztk.spark import models
 
 
 POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity(
     auto_user=batch_models.AutoUserSpecification(
@@ -89,6 +88,8 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
     setup = [
         'time('\
             'apt-get -y update;'\
+            'n=0;'\
+            'until [ $n -ge 15 ]; do apt-get -y --no-install-recommends install unzip && break; n=$[$n+1]; ps faux; sleep 5; echo "waited $n*5 seconds"; done;'\
             'apt-get -y --no-install-recommends install unzip;'\
             'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
             'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
@@ -145,4 +146,5 @@ def generate_cluster_start_task(core_base_operations,
         resource_files=resource_files,
         environment_settings=environment_settings,
         user_identity=POOL_ADMIN_USER_IDENTITY,
-        wait_for_success=True)
+        wait_for_success=True,
+        max_task_retry_count=2)
diff --git a/aztk/utils/helpers.py b/aztk/utils/helpers.py
index cd469889..b4453436 100644
--- a/aztk/utils/helpers.py
+++ b/aztk/utils/helpers.py
@@ -1,20 +1,23 @@
 from __future__ import print_function
+
 import datetime
 import io
+import logging
 import os
-import time
 import re
-import azure.common
-import azure.batch.batch_service_client as batch
+import time
+
 import azure.batch.batch_auth as batch_auth
+import azure.batch.batch_service_client as batch
 import azure.batch.models as batch_models
+import azure.common
 import azure.storage.blob as blob
-from aztk.version import __version__
-from aztk.utils import constants
-from aztk import error
-import aztk.models
 import yaml
-import logging
+
+import aztk.models
+from aztk import error
+from aztk.utils import constants
+from aztk.version import __version__
 
 _STANDARD_OUT_FILE_NAME = 'stdout.txt'
 _STANDARD_ERROR_FILE_NAME = 'stderr.txt'
@@ -179,19 +182,10 @@ def select_latest_verified_vm_image_with_node_agent_sku(publisher, offer, sku_st
     :rtype: tuple
     :return: (node agent sku id to use, vm image ref to use)
     """
-    # get verified vm image list and node agent sku ids from service
-    node_agent_skus = batch_client.account.list_node_agent_skus()
-
-    # pick the latest supported sku
-    skus_to_use = [(sku, image_ref)
-                   for sku in node_agent_skus
-                   for image_ref in sorted(sku.verified_image_references, key=lambda item: item.sku)
-                   if image_ref.publisher.lower() == publisher.lower() and image_ref.offer.lower() == offer.lower()
-                   and image_ref.sku.startswith(sku_starts_with)]
-
-    # skus are listed in reverse order, pick first for latest
-    sku_to_use, image_ref_to_use = skus_to_use[0]
-    return (sku_to_use.id, image_ref_to_use)
+    image_ref_to_use = batch_models.ImageReference(
+        publisher='microsoft-azure-batch', offer='ubuntu-server-container', sku='16-04-lts', version='latest')
+    node_agent_sku_id = 'batch.node.ubuntu 16.04'
+    return (node_agent_sku_id, image_ref_to_use)
 
 
 def create_sas_token(container_name, blob_name, permission, blob_client, expiry=None, timeout=None):
diff --git a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py
index ae085809..1b91be22 100644
--- a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py
+++ b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py
@@ -5,11 +5,10 @@
 
 import aztk
 from aztk.models import ClusterConfiguration
+from aztk.spark.models import PortForwardingSpecification
 from aztk_cli import config, log, utils
 from aztk_cli.config import SshConfig
 
-from aztk.spark.models import PortForwardingSpecification
-
 
 def setup_parser(parser: argparse.ArgumentParser):
     parser.add_argument('--id', dest="cluster_id", help='The unique id of your spark cluster')
@@ -37,7 +36,7 @@
 def execute(args: typing.NamedTuple):
     spark_client = aztk.spark.Client(config.load_aztk_secrets())
     cluster = spark_client.cluster.get(args.cluster_id)
-    cluster_config = spark_client.cluster.get_cluster_config(args.cluster_id)
+    cluster_config = spark_client.cluster._core_cluster_operations.get_cluster_config(args.cluster_id)
 
     ssh_conf = SshConfig()
     ssh_conf.merge(
diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py
index 31f40ae4..6f40539c 100644
--- a/aztk_cli/utils.py
+++ b/aztk_cli/utils.py
@@ -152,7 +152,7 @@ def ssh_in_master(client,
 
     # Get master node id from task (job and task are both named pool_id)
     cluster = client.cluster.get(cluster_id)
-    configuration = client.cluster.get_cluster_config(cluster_id)
+    configuration = client.cluster._core_cluster_operations.get_cluster_config(cluster_id)
     master_node_id = cluster.master_node_id