Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gpu support #3

Merged
11 commits merged on Jul 27, 2016
20 changes: 13 additions & 7 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
FROM debian:jessie
FROM nvidia/cuda:7.5-cudnn5-runtime
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so
RUN echo '\
deb mirror://mirrors.ubuntu.com/mirrors.txt trusty main\n\
deb mirror://mirrors.ubuntu.com/mirrors.txt trusty universe\n\
deb mirror://mirrors.ubuntu.com/mirrors.txt trusty-updates main\n'\
> /etc/apt/sources.list
RUN apt-get update && apt-get install --no-install-recommends -y \
g++ \
libopenblas-dev \
gcc \
libopenblas-base \
libzookeeper-mt-dev \
ca-certificates \
python-dev \
python-pip \
git-core && \
apt-get autoremove --purge -y && \
apt-get clean && \
rm -rf /var/cache/apt /var/lib/apt/lists
RUN pip install --download-cache="/tmp/pip" -r "https://raw.githubusercontent.com/douban/tfmesos/master/requirements.txt" && \
pip install --download-cache="/tmp/pip" "git+https://github.com/douban/tfmesos.git@master#egg=tfmesos" && \
rm -rf /tmp/*
RUN python -c 'import urllib2;exec(urllib2.urlopen("https://bootstrap.pypa.io/get-pip.py").read())' --no-cache-dir --timeout 1000 && \
pip install --no-cache-dir --timeout 1000 -r "https://raw.githubusercontent.com/douban/tfmesos/master/requirements.txt" && \
pip install --no-cache-dir --timeout 1000 "git+https://github.com/douban/tfmesos.git@master#egg=tfmesos"
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
argparse==1.2.1
google-common==0.0.1
numpy==1.10.4
numpy==1.11.1
protobuf==3.0.0b2
pymesos==0.1.2
pymesos==0.1.5
six==1.10.0
wheel==0.29.0
wsgiref==0.1.2
zkpython==0.4.2
-e git+https://github.com/douban/mesos.interface.git#egg=mesos.interface
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.8.0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl
68 changes: 61 additions & 7 deletions tfmesos/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import os
import re
import sys
import math
import select
import signal
import socket
import thread
import getpass
import logging
import urllib2
import textwrap
from mesos.interface import mesos_pb2, Scheduler
from pymesos import MesosSchedulerDriver
Expand All @@ -18,22 +21,24 @@

class Job(object):

def __init__(self, name, num, cpus=1.0, mem=1024.0):
def __init__(self, name, num, cpus=1.0, mem=1024.0, gpus=0):
self.name = name
self.num = num
self.cpus = cpus
self.gpus = gpus
self.mem = mem


class Task(object):

def __init__(self, mesos_task_id, job_name, task_index,
cpus=1.0, mem=1024.0, volumes={}):
cpus=1.0, mem=1024.0, gpus=0, volumes={}):
self.mesos_task_id = mesos_task_id
self.job_name = job_name
self.task_index = task_index

self.cpus = cpus
self.gpus = gpus
self.mem = mem
self.volumes = volumes
self.offered = False
Expand All @@ -49,7 +54,7 @@ def __str__(self):
addr=%s
>''' % (self.mesos_task_id, self.addr))

def to_task_info(self, offer, master_addr):
def to_task_info(self, offer, master_addr, gpu_uuids=[]):
ti = mesos_pb2.TaskInfo()
ti.task_id.value = str(self.mesos_task_id)
ti.slave_id.value = offer.slave_id.value
Expand All @@ -65,9 +70,11 @@ def to_task_info(self, offer, master_addr):
mem.type = mesos_pb2.Value.SCALAR
mem.scalar.value = self.mem

if 'DOCKER_IMAGE' in os.environ:
image = os.environ.get('DOCKER_IMAGE')

if image is not None:
ti.container.type = mesos_pb2.ContainerInfo.DOCKER
ti.container.docker.image = os.environ['DOCKER_IMAGE']
ti.container.docker.image = image

for path in ['/etc/passwd', '/etc/group']:
v = ti.container.volumes.add()
Expand All @@ -80,6 +87,41 @@ def to_task_info(self, offer, master_addr):
v.host_path = src
v.mode = mesos_pb2.Volume.RW

if self.gpus and gpu_uuids:
hostname = offer.hostname
url = 'http://%s:3476/docker/cli?dev=%s' % (
hostname, urllib2.quote(
' '.join(gpu_uuids)
)
)

try:
docker_args = urllib2.urlopen(url).read()
for arg in docker_args.split():
k, v = arg.split('=')
assert k.startswith('--')
k = k[2:]
p = ti.container.docker.parameters.add()
p.key = k
p.value = v

gpus = ti.resources.add()
gpus.name = 'gpus'
gpus.type = mesos_pb2.Value.SET
gpus.set.item.extend(gpu_uuids)
except Exception:
logger.exception(
'fail to determine remote device parameter,'
' disable gpu resources'
)

else:
if self.gpus and gpu_uuids:
gpus = ti.resources.add()
gpus.name = 'gpus'
gpus.type = mesos_pb2.Value.SET
gpus.set.item.extend(gpu_uuids)

ti.command.shell = True
cmd = [
sys.executable, "-m", "%s.server" % __package__,
Expand Down Expand Up @@ -136,6 +178,7 @@ def __init__(self, task_spec, master=None, name=None, quiet=False,
task_index,
cpus=job.cpus,
mem=job.mem,
gpus=job.gpus,
volumes=volumes,
)
)
Expand All @@ -155,27 +198,37 @@ def resourceOffers(self, driver, offers):
continue

offered_cpus = offered_mem = 0.0
offered_gpus = []
offered_tasks = []

for resource in offer.resources:
if resource.name == "cpus":
offered_cpus = resource.scalar.value
elif resource.name == "mem":
offered_mem = resource.scalar.value
elif resource.name == "gpus":
offered_gpus = resource.set.item

for task in self.tasks:
if task.offered:
continue

if not (task.cpus <= offered_cpus and
task.mem <= offered_mem):
task.mem <= offered_mem and
task.gpus <= len(offered_gpus)):

continue

offered_cpus -= task.cpus
offered_mem -= task.mem
gpus = int(math.ceil(task.gpus))
gpu_uuids = offered_gpus[:gpus]
offered_gpus = offered_gpus[gpus:]
task.offered = True
offered_tasks.append(task.to_task_info(offer, self.addr))
offered_tasks.append(
task.to_task_info(
offer, self.addr,
gpu_uuids=gpu_uuids))

driver.launchTasks(offer.id, offered_tasks, mesos_pb2.Filters())

Expand All @@ -195,6 +248,7 @@ def _start_tf_cluster(self):
"task_index": task.task_index,
"cpus": task.cpus,
"mem": task.mem,
"gpus": task.gpus,
"cluster_def": cluster_def,
}
send(task.connection, response)
Expand Down
3 changes: 2 additions & 1 deletion tfmesos/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def main(argv):
task_index = response["task_index"]
cpus = response["cpus"]
mem = response["mem"]
gpus = response["gpus"]

server_def = tf.train.ServerDef(
cluster=tf.train.ClusterSpec(cluster_def).as_cluster_def(),
Expand All @@ -38,7 +39,7 @@ def main(argv):
)

server_def.default_session_config.device_count["CPU"] = int(cpus)
server_def.default_session_config.device_count["GPU"] = 0
server_def.default_session_config.device_count["GPU"] = int(gpus)
(soft, hard) = resource.getrlimit(resource.RLIMIT_AS)
soft = min(float(mem), soft, hard)
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
Expand Down