Microsoft Azure Distributed Linear Learner Recipe #195
@@ -0,0 +1,149 @@
from __future__ import print_function
import json
import os
import sys
from azure.storage.blob import BlockBlobService
from os import listdir
from os.path import isfile, join


# make sure config data is encoded correctly
def encode(value):
    if isinstance(value, type('str')):
        return value
    return value.encode('utf-8')


# configuration class
class Configuration:

    def __init__(self, file_name):
        if not os.path.exists(file_name):
            raise ValueError('Cannot find configuration file "{0}"'.
                             format(file_name))

        with open(file_name, 'r') as f:
            conf = json.load(f)

        try:
            self.node_count = encode(conf['node_count'])
            self.thread_count = encode(conf['thread_count'])
            self.training_data_shred_count = encode(conf['training_data_shred_count'])
            self.dataset_local_directory = encode(conf['dataset_local_directory'])
            self.shredded_dataset_local_directory = encode(conf['shredded_dataset_local_directory'])
            self.shredded_dataset_Per_Node = encode(conf['shredded_dataset_Per_Node'])
            self.container_name = encode(conf['container_name'])
            self.trainind_dataset_name = encode(conf['trainind_dataset_name'])
            self.training_data_container_name = encode(conf['training_data_container_name'])
            self.subscription_id = encode(conf['subscription_id'])
            self.secret_key = encode(conf['secret_key'])
            self.resource_group = encode(conf['resource_group'])
            self.storage_account_name = encode(conf['storage_account']['name'])
            self.storage_account_key = encode(conf['storage_account']['key'])
        except KeyError as err:
            raise AttributeError('Please provide a value for "{0}" configuration key'.format(err.args[0]))


# load the configuration data
cfg = Configuration('configuration.json')

# Azure blob service object
blob_service = BlockBlobService(cfg.storage_account_name, cfg.storage_account_key)

# container that will receive the shredded (per-node) data
azure_blob_container_name = cfg.container_name

# container that hosts the original training data
azure_blob_training_data_container_name = cfg.training_data_container_name

# create the container that will host the data blobs
blob_service.create_container(azure_blob_container_name, fail_on_exist=False)


# download the data from the training blob container, partition it into shreds,
# combine the shreds into one file per node/thread, and upload the result
def partition_and_upload_dataset_to_blob(blob_service, azure_blob_container_name):

    # list the blobs in the training data container and total their size
    blobs = []
    marker = None
    blobs_size = 0
    while True:
        batch = blob_service.list_blobs(azure_blob_training_data_container_name, marker=marker)
        blobs.extend(batch)
        if not batch.next_marker:
            break
        marker = batch.next_marker
    for blob in blobs:
        blobs_size += blob.properties.content_length
        print(blob.name)

    # total number of worker threads across the pool
    vm_thread_count = (int(cfg.node_count) - 1) * int(cfg.thread_count)

    # number of shred files combined into each per-node file
    file_count = int(cfg.training_data_shred_count) // vm_thread_count

    # target size of each shred file
    file_size = blobs_size // int(cfg.training_data_shred_count)

    # local directory for the downloaded training data
    dataset_local_directory = os.path.normpath(cfg.dataset_local_directory)

    # local directories for the shredded data and the per-node data
    shredded_dataset_local_directory = os.path.normpath(cfg.shredded_dataset_local_directory)
    shredded_dataset_Per_Node = os.path.normpath(cfg.shredded_dataset_Per_Node)

    # download the data from the training blob container and slice it locally
    print('downloading the dataset from the blob container and shredding it locally...')
    i = 0
    for itr in range(len(blobs)):
        blob = blobs[itr]
        blob_service.get_blob_to_path(azure_blob_training_data_container_name,
                                      blob.name, os.path.join(dataset_local_directory, blob.name))
        file_name_no_extension, file_extension = os.path.splitext(blob.name)

        lines_bytes_size = 0
        alist = []
        with open(os.path.join(dataset_local_directory, blob.name), 'r') as in_file:
            for line in in_file:
                lines_bytes_size += sys.getsizeof(line)
                alist.append(line)
                if lines_bytes_size >= file_size:
                    with open(os.path.join(shredded_dataset_local_directory,
                              file_name_no_extension + '_' + str(itr) + '_' + str(i) + file_extension), 'w') as wr:
                        for item in alist:
                            wr.write(item)
                    lines_bytes_size = 0
                    alist = []
                    i += 1
        # note: any leftover lines that total less than file_size after the loop
        # above are not flushed to a shred file

    # combine the shredded files into one file per node/thread
    alldatafiles = [f for f in listdir(shredded_dataset_local_directory) if isfile(join(shredded_dataset_local_directory, f))]
    low_index = 0
    high_index = file_count
    filename = "data.lst"
    for vm_count in range(vm_thread_count):
        blob_name = cfg.trainind_dataset_name + "-" + "%05d" % (vm_count,)
        if high_index > len(alldatafiles):
            high_index = len(alldatafiles)
        if not os.path.exists(os.path.join(shredded_dataset_Per_Node, blob_name)):
            os.makedirs(os.path.join(shredded_dataset_Per_Node, blob_name))
        with open(os.path.join(shredded_dataset_Per_Node, blob_name, filename), 'w') as outfile:
            for itr in range(low_index, high_index):
                with open(os.path.join(shredded_dataset_local_directory, alldatafiles[itr])) as infile:
                    for line in infile:
                        outfile.write(line)
        low_index += file_count
        high_index += file_count

    # upload the combined per-node data to the destination blob container
    for subdir, dirs, files in os.walk(shredded_dataset_Per_Node):
        for file in files:
            print(os.path.basename(subdir))
            print(os.path.join(subdir, file))
            blob_service.create_blob_from_path(azure_blob_container_name, os.path.basename(subdir) + '/' + file, os.path.join(subdir, file))

    print('Done')


# begin loading, partitioning and deploying the training data
partition_and_upload_dataset_to_blob(blob_service, azure_blob_container_name)
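A quick way to confirm the upload afterwards is to list the destination container with the same `azure-storage` API the script uses. This is only a verification sketch, not part of the recipe:

```python
# Verification sketch: list what landed in the destination container after the
# shredding script finishes. Reads the same configuration.json the script uses.
import json
from azure.storage.blob import BlockBlobService

with open('configuration.json') as f:
    conf = json.load(f)

blob_service = BlockBlobService(conf['storage_account']['name'],
                                conf['storage_account']['key'])
# note: prints only the first results segment; pass marker=... to page further
for blob in blob_service.list_blobs(conf['container_name']):
    print(blob.name, blob.properties.content_length)
```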
@@ -0,0 +1,27 @@
## MADL-CPU-OpenMPI Data Shredding
This Data Shredding recipe shows how to shred and deploy your training data prior to running a training job on
Azure VMs via Open MPI.

### Data Shredding Configuration
Rename `configuration-template.json` to `configuration.json`. The configuration should set the following properties:
* `node_count` should be set to the number of VMs in the compute pool.
* `thread_count` the number of threads per VM.
* `training_data_shred_count` the number of data shreds. It is advisable to set this number high; that way the shredding step only has to be done once and the shreds can be reused for different VM configurations.
* `dataset_local_directory` a local directory into which the training data is downloaded and shredded according to `training_data_shred_count`.
* `shredded_dataset_local_directory` a local directory that holds the intermediate data shreds.
* `shredded_dataset_Per_Node` a local directory that holds the final per-node data shreds before they are deployed to Azure blobs.
* `container_name` the container name where the sliced data will be stored.
* `trainind_dataset_name` a name for the dataset, used when creating the data blobs.
* `subscription_id` the Azure subscription id.
* `secret_key` the Azure password.
* `resource_group` the resource group name.
* `storage_account` the storage account name and access key.
* `training_data_container_name` the container name where the training data is hosted.

You can use your own access mechanism (password, access key, etc.); the above is only one example. However, make sure to update the Python script
whenever you make a configuration change.
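To illustrate how these settings interact, the sketch below loads `configuration.json`, checks for the keys the script's `Configuration` class expects, and derives the per-node file count the same way the script does. The concrete values are whatever you put in your configuration; nothing here is prescribed by the recipe:

```python
# Sketch: sanity-check configuration.json and show how the shredding script
# derives its per-node file count from node_count, thread_count and
# training_data_shred_count.
import json

required_keys = [
    'node_count', 'thread_count', 'training_data_shred_count',
    'dataset_local_directory', 'shredded_dataset_local_directory',
    'shredded_dataset_Per_Node', 'container_name', 'trainind_dataset_name',
    'training_data_container_name', 'subscription_id', 'secret_key',
    'resource_group', 'storage_account',
]

with open('configuration.json') as f:
    conf = json.load(f)

missing = [k for k in required_keys if k not in conf]
if missing:
    raise SystemExit('configuration.json is missing: {0}'.format(', '.join(missing)))

# total worker threads across the pool (the script excludes one node)
vm_thread_count = (int(conf['node_count']) - 1) * int(conf['thread_count'])

# shreds combined into each per-node data.lst; a high (and ideally divisible)
# training_data_shred_count keeps the shreds reusable across pool sizes
files_per_node = int(conf['training_data_shred_count']) // vm_thread_count
print('threads across pool:', vm_thread_count, '| shreds per node file:', files_per_node)
```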

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)
@@ -0,0 +1,18 @@
{
    "node_count": <node count>,
    "thread_count": <thread count per node>,
    "training_data_shred_count": <number of data shreds>,
    "dataset_local_directory": <local dir for training data>,
    "shredded_dataset_local_directory": <local dir for sliced data>,
    "shredded_dataset_Per_Node": <local dir for final shredded data per node>,
    "container_name": <container name where the sliced data will be stored>,
    "trainind_dataset_name": <name for the dataset, used when creating the data blobs>,
    "subscription_id": <azure subscription id>,
    "secret_key": <password>,
    "resource_group": <resource group name>,
    "storage_account": {
        "name": <storage account name>,
        "key": <storage key>
    },
    "training_data_container_name": <container name where the training data is hosted>
}
@@ -0,0 +1,61 @@
# MADL-CPU-OpenMPI
This recipe shows how to run the High Performance ML Algorithms Learner on CPUs across
Azure VMs via Open MPI.

## Configuration
Please refer to this [set of sample configuration files](./config) for
this recipe.

### Pool Configuration
The pool configuration should enable the following properties:
* `vm_size` should be a CPU-only instance, for example `STANDARD_D2_V2`.
* `inter_node_communication_enabled` must be set to `true`
* `max_tasks_per_node` must be set to 1 or omitted

### Global Configuration
The global configuration should set the following properties:
* `docker_images` array must have a reference to a valid MADL
Docker image that can be run with OpenMPI. The image denoted with the `0.0.1` tag, found at [msmadl/symsgd:0.0.1](https://hub.docker.com/r/msmadl/symsgd/),
is compatible with Azure Batch Shipyard VMs.

### MPI Jobs Configuration (MultiNode)
The jobs configuration should set the following properties within the `tasks`
array, which should have a task definition containing:
* `docker_image` should be the name of the Docker image for this container invocation.
For this example, this should be
`msmadl/symsgd:0.0.1`.
Please note that the `docker_images` in the Global Configuration should match
this image name.
* `command` should contain the command to pass to the Docker run invocation.
This example runs the MADL training application in the `msmadl/symsgd:0.0.1` Docker image. The
application `command` to run would be:
`"/parasail/run_parasail.sh -w /parasail/supersgd -l 1e-4 -k 32 -m 1e-2 -e 10 -r 10 -f $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file> -t 1 -g 1 -d $AZ_BATCH_TASK_WORKING_DIR/models -b $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file>"`
* [`run_parasail.sh`](docker/run_parasail.sh) has these parameters
  * `-w` the MADL superSGD directory
  * `-l` learning rate
  * `-k` approximation rank constant
  * `-m` model combiner convergence threshold
  * `-e` total epochs
  * `-r` rounds per epoch
  * `-f` training file prefix
  * `-t` number of threads
  * `-g` log global models every this many epochs
  * `-d` log global models to this directory at the host
  * `-b` location for the algorithm's binary
* The training data will need to be shredded to match the number of VMs and the number of threads per VM, and then deployed to a mounted Azure blob container to which the VM Docker images have read/write access.
A basic Python script is provided to shred and deploy the training data to a blob container identified by the user; the data shredding files can be found [here](./DataShredding). The sketch following this list illustrates the resulting blob layout.
* `shared_data_volumes` should contain the shared data volume with an `azureblob` volume driver as specified in the global configuration file found [here](./config/config.yaml).
* `multi_instance` property must be defined
  * `num_instances` should be set to `pool_current_dedicated`, or
    `pool_current_low_priority`
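For orientation, here is a hedged sketch (hypothetical dataset name and counts, not values mandated by the recipe) of the per-node blob layout the data shredding script produces, which is what the `-f` and `-b` prefixes point at inside the mounted container:

```python
# Sketch: reproduce the blob naming scheme used by the data shredding script,
# i.e. "<trainind_dataset_name>-NNNNN/data.lst" for each worker thread.
# The values below are hypothetical examples.
trainind_dataset_name = 'mnist8m'   # "trainind_dataset_name" in configuration.json
node_count = 3                      # matches the pool's dedicated vm_count
thread_count = 1                    # matches -t in the task command

vm_thread_count = (node_count - 1) * thread_count
for vm_count in range(vm_thread_count):
    blob_name = trainind_dataset_name + '-' + '%05d' % (vm_count,)
    # on each node these appear under
    # $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name>/<blob_name>/data.lst
    print(blob_name + '/data.lst')
```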

## Dockerfile and supplementary files
Supplementary files can be found [here](./docker).

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)
@@ -0,0 +1,13 @@
batch_shipyard:
  storage_account_settings: mystorageaccount
global_resources:
  docker_images:
    - msmadl/symsgd:0.0.1
  volumes:
    shared_data_volumes:
      azureblob_vol:
        volume_driver: azureblob
        storage_account_settings: mystorageaccount
        azure_blob_container_name: <blob container name>
        container_path: $AZ_BATCH_NODE_SHARED_DIR/azblob
        bind_options: rw
@@ -0,0 +1,9 @@
credentials:
  batch:
    account_key: <batch account key>
    account_service_url: <batch account service url>
  storage:
    mystorageaccount:
      account: <storage account name>
      account_key: <storage account key>
      endpoint: core.windows.net
@@ -0,0 +1,10 @@
job_specifications:
  - id: <job id>
    auto_complete: true
    shared_data_volumes:
      - azureblob_vol
    tasks:
      - docker_image: msmadl/symsgd:0.0.1
        multi_instance:
          num_instances: pool_current_dedicated
        command: /parasail/run_parasail.sh -w /parasail/supersgd -l 1e-4 -k 32 -m 1e-2 -e 10 -r 10 -f $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file> -t 1 -g 1 -d $AZ_BATCH_TASK_WORKING_DIR/models -b $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file>
@@ -0,0 +1,12 @@
pool_specification:
  id: <pool id>
  vm_configuration:
    platform_image:
      offer: UbuntuServer
      publisher: Canonical
      sku: 16.04-LTS
  vm_count:
    dedicated: 3
    low_priority: 0
  vm_size: STANDARD_D2_V2
  inter_node_communication_enabled: true
@@ -0,0 +1,45 @@
# Dockerfile for MADL (Microsoft Distributed Learners)

FROM ubuntu:16.04
MAINTAINER Saeed Maleki Todd Mytkowicz Madan Musuvathi Dany Rouhana <https://github.com/Azure/batch-shipyard>

ENV DEBIAN_FRONTEND=noninteractive

# install base system
RUN apt-get update && apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server \
        libopenblas-dev \
        libatlas-base-dev \
        liblapacke-dev \
        openmpi-bin \
        openmpi-common \
        libopenmpi-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# configure ssh server and keys
RUN mkdir -p /root/.ssh && \
    echo "Host *\n\tPort 23\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null" > /root/.ssh/config && \
    mkdir /var/run/sshd && \
    ssh-keygen -A && \
    sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
    ssh-keygen -f /root/.ssh/id_rsa -t rsa -N '' && \
    chmod 600 /root/.ssh/config && \
    chmod 700 /root/.ssh && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys

# set the parasail working directory
WORKDIR /parasail

# to create your own image, first download supersgd from the link supplied in the
# README and place it in the same directory as this Dockerfile
COPY supersgd /parasail
COPY run_parasail.sh /parasail

# remove romio314 bits
RUN rm -rf /usr/lib/openmpi/lib/openmpi/mca_io_romio.so

# make sshd listen on port 23 and run by default
EXPOSE 23
CMD ["/usr/sbin/sshd", "-D", "-p", "23"]
@@ -0,0 +1,9 @@
# Dockerfile for msmadl/symsgd
This image can be found on [Docker Hub](https://hub.docker.com/r/msmadl/symsgd/).

You can use the algorithm by pulling the published Docker image (`docker pull msmadl/symsgd`). If you decide to build your own image, first download the High Performance ML Algorithms Learner binary from this [link](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/supersgd) and include it in the same directory as your Dockerfile.

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)