This repository was archived by the owner on Sep 3, 2022. It is now read-only.
Merged
1 change: 1 addition & 0 deletions datalab/mlalpha/__init__.py
@@ -28,6 +28,7 @@
from ._confusion_matrix import ConfusionMatrix
from ._analysis import CsvEvalResults, CsvEvalSource, EvalResultsCsvCoder, \
    AccuracyFn, FeatureSlicingPipeline
from ._package_runner import PackageRunner

from plotly.offline import init_notebook_mode

87 changes: 87 additions & 0 deletions datalab/mlalpha/_package_runner.py
@@ -0,0 +1,87 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.

"""Implements running Datalab ML Solution Packages."""

import inspect
import google.cloud.ml as ml
import os
import shutil
import subprocess
import sys
import tempfile


PACKAGE_NAMESPACE = 'datalab_solutions'

class PackageRunner(object):
  """A helper class to run Datalab ML solution packages."""

  def __init__(self, package_uri):
    """
    Args:
      package_uri: The URI of the package. The file base name needs to be in the form
          "name-version", such as "inception-0.1". The part before the first "-" is used
          as the last component of the namespace; in the example above, the namespace
          is "datalab_solutions.inception".
    """
    self._package_uri = package_uri
    self._name = os.path.basename(package_uri).split('-')[0]
    self._install_dir = None

  def _install_to_temp(self):
Contributor: Add a TODO saying to do something better than installing/deleting the package every time a command is run.

Oh, and this is done twice when --usage is used.

Contributor (Author): It's not horribly slow, but that may change if we add dependencies in setup.py.

It is important that each function call be stateless; otherwise it is really hard to manage the session and control when to clean up. Indeed, the --usage case can be further improved to call it once. Added a TODO for that.

Contributor (Author): I implemented __enter__ and __exit__ so we can use it in a with statement. One call for --usage now.

Contributor: I love the idea, but I don't think you did it:

  1. __exit__ should call _cleanup_installation.
  2. I expected __enter__ to be the only function that calls _install_to_temp.
  3. get_func_args_and_docstring (and the other one) still call _install_to_temp(), so the --usage case is still running things twice, I think.

Contributor (Author): Indeed! Fixed. PTAL.

    install_dir = tempfile.mkdtemp()
    tar_path = self._package_uri
    if tar_path.startswith('gs://'):
      tar_path = os.path.join(install_dir, os.path.basename(tar_path))
      ml.util._file.copy_file(self._package_uri, tar_path)
    subprocess.check_call(['pip', 'install', tar_path, '--target', install_dir,
                           '--upgrade', '--force-reinstall'])
    sys.path.insert(0, install_dir)
    self._install_dir = install_dir

  def __enter__(self):
    self._install_to_temp()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self._cleanup_installation()

  def _cleanup_installation(self):
    if self._install_dir is None:
      return
    if sys.path[0] == self._install_dir:
      del sys.path[0]
    shutil.rmtree(self._install_dir)

  def get_func_args_and_docstring(self, func_name):
    """Get a function's argspec and docstring.

    Args:
      func_name: name of the function.

    Returns:
      A tuple of (function argspec, function docstring).
    """
    func = getattr(__import__(PACKAGE_NAMESPACE + '.' + self._name, fromlist=[func_name]),
                   func_name)
    return inspect.getargspec(func), func.__doc__

  def run_func(self, func_name, args):
    """Run a function from the installed package.

    Args:
      func_name: name of the function.
      args: a dict of arguments supplied to the function as keyword arguments.

    Returns:
      The function's return value.
    """
    func = getattr(__import__(PACKAGE_NAMESPACE + '.' + self._name, fromlist=[func_name]),
                   func_name)
    return func(**args)
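
For orientation, a minimal usage sketch of PackageRunner as a context manager (the package URI and the argument names passed to run_func are hypothetical):

import datalab.mlalpha

# The base name 'inception-0.1' maps to the namespace 'datalab_solutions.inception'.
with datalab.mlalpha.PackageRunner('gs://my-bucket/inception-0.1.tar.gz') as pr:
  # __enter__ pip-installs the tarball into a temp dir and prepends it to sys.path.
  argspec, docstring = pr.get_func_args_and_docstring('local_preprocess')
  print docstring
  # run_func imports datalab_solutions.inception and calls local_preprocess(**args).
  pr.run_func('local_preprocess', {'input_dir': './data', 'output_dir': './out'})
# __exit__ removes the temp install dir from sys.path and deletes it.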
1 change: 1 addition & 0 deletions datalab/mlalpha/commands/__init__.py
@@ -13,5 +13,6 @@

from __future__ import absolute_import

from . import _ml
from . import _mlalpha
from . import _tensorboard
160 changes: 160 additions & 0 deletions datalab/mlalpha/commands/_ml.py
@@ -0,0 +1,160 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.

try:
  import IPython
  import IPython.core.magic
except ImportError:
  raise Exception('This module can only be loaded in ipython.')
Contributor: This -> IPython


import collections
import os
import yaml

import datalab.context
import datalab.mlalpha
import datalab.utils.commands


@IPython.core.magic.register_line_cell_magic
def ml(line, cell=None):
  """Implements the ml line cell magic.

  Args:
    line: the contents of the ml line.
    cell: the contents of the ml cell.

  Returns:
    The results of executing the cell.
  """
  parser = datalab.utils.commands.CommandParser(prog="ml", description="""
Execute various ml-related operations. Use "%%ml <command> -h" for help on a specific command.
""")
  preprocess_parser = parser.subcommand('preprocess', 'Run a preprocess job.')
  preprocess_parser.add_argument('--usage',
                                 help='Show usage from the specified preprocess package.',
                                 action='store_true', default=False)
  preprocess_parser.add_argument('--cloud',
                                 help='Whether to run the preprocessing job in the cloud.',
                                 action='store_true', default=False)
  preprocess_parser.add_argument('--package',
                                 help='The preprocess package to use. Can be a gs or local path.',
                                 required=True)
  preprocess_parser.set_defaults(func=_preprocess)

  train_parser = parser.subcommand('train', 'Train an ML model.')
  train_parser.add_argument('--usage',
                            help='Show usage from the specified trainer package.',
                            action='store_true', default=False)
  train_parser.add_argument('--cloud',
                            help='Whether to run the training job in the cloud.',
                            action='store_true', default=False)
  train_parser.add_argument('--package',
                            help='The trainer package to use. Can be a gs or local path.',
                            required=True)
  train_parser.set_defaults(func=_train)

  predict_parser = parser.subcommand('predict', 'Predict with an ML model.')
  predict_parser.add_argument('--usage',
                              help='Show usage from the specified prediction package.',
                              action='store_true', default=False)
  predict_parser.add_argument('--cloud',
                              help='Whether to run prediction in the cloud.',
                              action='store_true', default=False)
  predict_parser.add_argument('--package',
                              help='The prediction package to use. Can be a gs or local path.',
                              required=True)
  predict_parser.set_defaults(func=_predict)

  batch_predict_parser = parser.subcommand('batch_predict', 'Batch predict with an ML model.')
  batch_predict_parser.add_argument('--usage',
                                    help='Show usage from the specified prediction package.',
                                    action='store_true', default=False)
  batch_predict_parser.add_argument('--cloud',
                                    help='Whether to run prediction in the cloud.',
                                    action='store_true', default=False)
  batch_predict_parser.add_argument('--package',
                                    help='The prediction package to use. Can be a gs or local path.',
                                    required=True)
  batch_predict_parser.set_defaults(func=_batch_predict)

  namespace = datalab.utils.commands.notebook_environment()
  return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)


def _command_template(pr, func_name):
  """Return (args_list, docstring).

  args_list is in the form of:
    arg1:
    arg2:
    arg3: (optional)
  """
  argspec, docstring = pr.get_func_args_and_docstring(func_name)
  num_defaults = len(argspec.defaults) if argspec.defaults is not None else 0
  # Need to fill in a placeholder (here '(NOT_OP)') for non-optional args.
  # Later we replace '(NOT_OP)' with an empty string.
  optionals = ['(NOT_OP)'] * (len(argspec.args) - num_defaults) + \
              ['(optional)'] * num_defaults
  args = dict(zip(argspec.args, optionals))
  args_dump = yaml.safe_dump(args, default_flow_style=False).replace('(NOT_OP)', '')
  return args_dump, docstring
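
As a concrete sketch: for a hypothetical package function def local_preprocess(input_dir, output_dir, max_rows=None), the returned args_dump would render as (keys sorted alphabetically by yaml.safe_dump, the default marked optional):

  input_dir:
  max_rows: (optional)
  output_dir: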


def _run_package(args, cell, mode):
  local_func_name = 'local_' + mode
  cloud_func_name = 'cloud_' + mode
  with datalab.mlalpha.PackageRunner(args['package']) as pr:
    if args['usage'] is True:
      # TODO: Consider calling _command_template once to save one pip installation.
      command_local = """%%ml %s --package %s""" % (mode, args['package'])
      args_local, docstring_local = _command_template(pr, local_func_name)
      command_cloud = """%%ml %s --package %s --cloud""" % (mode, args['package'])
      args_cloud, docstring_cloud = _command_template(pr, cloud_func_name)
      output = """
Local Run Command:

%s
%s
[Description]:
%s

Cloud Run Command:

%s
%s
[Description]:
%s
""" % (command_local, args_local, docstring_local, command_cloud, args_cloud, docstring_cloud)
      return datalab.utils.commands.render_text(output, preformatted=True)

    env = datalab.utils.commands.notebook_environment()
    func_args = datalab.utils.commands.parse_config(cell, env)
    if args['cloud'] is True:
      return pr.run_func(cloud_func_name, func_args)
    else:
      return pr.run_func(local_func_name, func_args)


def _preprocess(args, cell):
  return _run_package(args, cell, 'preprocess')


def _train(args, cell):
  return _run_package(args, cell, 'train')


def _predict(args, cell):
  return _run_package(args, cell, 'predict')


def _batch_predict(args, cell):
  return _run_package(args, cell, 'batch_predict')
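
Taken together, a notebook session might look like this (the package path and YAML keys are hypothetical):

%%ml train --usage --package gs://my-bucket/inception-0.1.tar.gz

prints the local and cloud run templates for the package; then

%%ml train --package gs://my-bucket/inception-0.1.tar.gz
input_dir: ./preprocessed
output_dir: ./model

parses the cell body as YAML via parse_config and calls local_train(input_dir='./preprocessed', output_dir='./model') from the installed package.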
49 changes: 37 additions & 12 deletions datalab/mlalpha/commands/_mlalpha.py
@@ -29,6 +29,7 @@
import urllib
import yaml

import datalab.bigquery as bq
import datalab.context
import datalab.data
import datalab.mlalpha
@@ -159,16 +160,19 @@ def mlalpha(line, cell=None):
  package_parser.add_argument('--output', help='the output dir of the package.', required=True)
  package_parser.set_defaults(func=_package)

-  package_parser = parser.subcommand('feature-slice-view', 'View results of a ' +
+  feature_slice_parser = parser.subcommand('feature-slice-view', 'View results of a ' +
                                     'FeatureSlicingPipeline, some eval metrics grouped by ' +
                                     'specified feature column values')
-  package_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline',
-                              required=True)
-  package_parser.add_argument('--feature',
-                              help='Which feature to view. The feature must be specified ' +
-                                   'in the FeatureSlicingPipeline. If not specified, all ' +
-                                   'features will be listed.')
-  package_parser.set_defaults(func=_feature_slice_view)
+  feature_slice_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline')
+  feature_slice_parser.add_argument('--sql',
+                                    help='The sql module which should return "feature" and ' +
+                                         '"count" columns, plus at least one metric column ' +
+                                         'with any name')
+  feature_slice_parser.add_argument('--feature',
+                                    help='Which feature to view. The feature must be specified ' +
+                                         'in the FeatureSlicingPipeline. If not specified, all ' +
+                                         'features will be listed.')
+  feature_slice_parser.set_defaults(func=_feature_slice_view)

  namespace = datalab.utils.commands.notebook_environment()
  return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)
@@ -986,6 +990,20 @@ def _package(args, cell):
  print 'Package created at %s.' % dest


def _get_lantern_format(df):
  if ('count' not in df) or ('feature' not in df):
    raise Exception('No "count" or "feature" column found in the data.')
  data = []
  for _, row in df.iterrows():
    metric_values = dict(row)
    metric_values['totalWeightedExamples'] = metric_values['count']
    del metric_values['feature']
    del metric_values['count']
    data.append({'feature': row['feature'], 'metricValues': metric_values})
  return data


def _feature_slice_view(args, cell):
  HTML_TEMPLATE = """<link rel="import" href="/nbextensions/gcpdatalab/extern/lantern-browser.html" >
<lantern-browser id="%s"></lantern-browser>
@@ -997,10 +1015,17 @@ def _feature_slice_view(args, cell):
browser.weightedExamplesColumn = 'totalWeightedExamples';
browser.calibrationPlotUriFn = function(s) { return '/' + s; }
</script>"""
-  with open(args['file']) as f:
-    data = map(json.loads, f)
-  if args['feature']:
-    data = [e for e in data if e['feature'].split(':')[0] == args['feature']]
+  if args['sql'] is not None:
+    item = datalab.utils.commands.get_notebook_item(args['sql'])
+    item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(item, {})
+    query = datalab.bigquery.Query(item)
+    df = query.results().to_dataframe()
+    data = _get_lantern_format(df)
+  elif args['dataframe'] is not None:
+    item = datalab.utils.commands.get_notebook_item(args['dataframe'])
+    data = _get_lantern_format(item)
+  else:
+    raise Exception('Either --sql or --dataframe is needed.')
  metrics_str = str(map(str, data[0]['metricValues'].keys()))
  data_str = str([{str(k): json.dumps(v) for k, v in elem.iteritems()} for elem in data])
  html_id = 'l' + datalab.utils.commands.Html.next_id()
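
A sketch of the transformation _get_lantern_format performs, with hypothetical slicing results:

import pandas as pd

# One row per feature slice; 'feature' and 'count' are required, and every
# other column is treated as a metric.
df = pd.DataFrame([
    {'feature': 'country:US', 'count': 100, 'accuracy': 0.92},
    {'feature': 'country:CA', 'count': 40, 'accuracy': 0.88},
])
# _get_lantern_format(df) returns:
#   [{'feature': 'country:US',
#     'metricValues': {'accuracy': 0.92, 'totalWeightedExamples': 100}},
#    {'feature': 'country:CA',
#     'metricValues': {'accuracy': 0.88, 'totalWeightedExamples': 40}}]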
@@ -11,4 +11,5 @@
# the License.


-from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, cloud_predict
+from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \
+    cloud_predict, local_batch_predict, cloud_batch_predict
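
For reference, the module re-exported here simply holds the module-level entry points that PackageRunner looks up by name. A hypothetical minimal skeleton (names and signatures are illustrative, not from this diff):

# datalab_solutions/inception/_package.py (hypothetical)
def local_batch_predict(model_dir, input_csv, output_file):
  """Batch predict with a locally stored model."""
  pass

def cloud_batch_predict(model_dir, input_csv, output_file):
  """Batch predict using the Cloud ML service."""
  pass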