Skip to content
This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Commit 07b59e5

Browse files
committed
Datalab "ml" magics for running a solution package. Update Inception Package. (#121)
* Datalab Inception (image classification) solution. * Fix dataflow URL. * Datalab "ml" magics for running a solution package. - Dump function args and docstrings - Run functions Update Inception Package. - Added docstring on face functions. - Added batch prediction. - Use datalab's lib for talking to cloud training and prediction service. - More minor fixes and changes. * Follow up on code review comments. * Fix a PackageRunner issue where the temp installation was done multiple times unnecessarily.
1 parent 24f1a6e commit 07b59e5

File tree

13 files changed

+536
-95
lines changed

13 files changed

+536
-95
lines changed

datalab/mlalpha/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from ._confusion_matrix import ConfusionMatrix
2929
from ._analysis import CsvEvalResults, CsvEvalSource, EvalResultsCsvCoder, \
3030
AccuracyFn, FeatureSlicingPipeline
31+
from ._package_runner import PackageRunner
3132

3233
from plotly.offline import init_notebook_mode
3334

datalab/mlalpha/_package_runner.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright 2017 Google Inc. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4+
# in compliance with the License. You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software distributed under the License
9+
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10+
# or implied. See the License for the specific language governing permissions and limitations under
11+
# the License.
12+
13+
"""Implements running Datalab ML Solution Packages."""
14+
15+
import inspect
16+
import google.cloud.ml as ml
17+
import os
18+
import shutil
19+
import subprocess
20+
import sys
21+
import tempfile
22+
23+
24+
PACKAGE_NAMESPACE = 'datalab_solutions'


class PackageRunner(object):
  """A Helper class to run Datalab ML solution packages.

  Intended for use as a context manager: the package is pip-installed into a
  temporary directory on __enter__ and the installation is removed on __exit__.
  """

  def __init__(self, package_uri):
    """
    Args:
      package_uri: The uri of the package. The file base name needs to be in the form of
          "name-version", such as "inception-0.1". The first part split by "-" will be used
          as the last part of the namespace. In the example above,
          "datalab_solutions.inception" will be the namespace.
    """
    self._package_uri = package_uri
    # "inception-0.1.tar.gz" -> "inception"; becomes the sub-package name.
    self._name = os.path.basename(package_uri).split('-')[0]
    self._install_dir = None

  def _install_to_temp(self):
    """Pip-install the package into a fresh temp dir and prepend it to sys.path."""
    install_dir = tempfile.mkdtemp()
    tar_path = self._package_uri
    if tar_path.startswith('gs://'):
      # pip cannot read from GCS directly; stage the tarball locally first.
      tar_path = os.path.join(install_dir, os.path.basename(tar_path))
      ml.util._file.copy_file(self._package_uri, tar_path)
    subprocess.check_call(['pip', 'install', tar_path, '--target', install_dir,
                           '--upgrade', '--force-reinstall'])
    sys.path.insert(0, install_dir)
    self._install_dir = install_dir

  def __enter__(self):
    self._install_to_temp()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self._cleanup_installation()

  def _cleanup_installation(self):
    """Remove the temporary installation. Safe to call more than once."""
    if self._install_dir is None:
      return
    # Remove the sys.path entry added by _install_to_temp wherever it now
    # sits (other code may have inserted entries in front of it since).
    if self._install_dir in sys.path:
      sys.path.remove(self._install_dir)
    shutil.rmtree(self._install_dir)
    # Reset so a repeated cleanup is a no-op instead of failing with an
    # OSError on the already-deleted directory.
    self._install_dir = None

  def _get_func(self, func_name):
    """Import "datalab_solutions.<name>" (resolved from the temp install dir
    on sys.path) and return the named function from it."""
    module = __import__(PACKAGE_NAMESPACE + '.' + self._name, fromlist=[func_name])
    return getattr(module, func_name)

  def get_func_args_and_docstring(self, func_name):
    """Get function args and docstrings.
    Args:
      func_name: name of the function.
    Returns:
      A tuple of function argspec, function docstring.
    """
    func = self._get_func(func_name)
    return inspect.getargspec(func), func.__doc__

  def run_func(self, func_name, args):
    """Run a function.
    Args:
      func_name: name of the function.
      args: args supplied to the functions.
    Returns:
      function return values.
    """
    return self._get_func(func_name)(**args)

datalab/mlalpha/commands/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@
1313

1414
from __future__ import absolute_import
1515

16+
from . import _ml
1617
from . import _mlalpha
1718
from . import _tensorboard

datalab/mlalpha/commands/_ml.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Copyright 2017 Google Inc. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4+
# in compliance with the License. You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software distributed under the License
9+
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10+
# or implied. See the License for the specific language governing permissions and limitations under
11+
# the License.
12+
13+
try:
14+
import IPython
15+
import IPython.core.magic
16+
except ImportError:
17+
raise Exception('This module can only be loaded in ipython.')
18+
19+
import collections
20+
import os
21+
import yaml
22+
23+
import datalab.context
24+
import datalab.mlalpha
25+
import datalab.utils.commands
26+
27+
28+
@IPython.core.magic.register_line_cell_magic
def ml(line, cell=None):
  """Implements the ml line cell magic.

  Args:
    line: the contents of the ml line.
    cell: the contents of the ml cell.

  Returns:
    The results of executing the cell.
  """
  parser = datalab.utils.commands.CommandParser(prog="ml", description="""
Execute various ml-related operations. Use "%%ml <command> -h" for help on a specific command.
""")
  # One entry per subcommand:
  # (name, description, --usage help, --cloud help, --package help, handler).
  subcommands = [
      ('preprocess', 'Run a preprocess job.',
       'Show usage from the specified preprocess package.',
       'Whether to run the preprocessing job in the cloud.',
       'The preprocess package to use. Can be a gs or local path.',
       _preprocess),
      ('train', 'Train an ML model.',
       'Show usage from the specified trainer package',
       'Whether to run the training job in the cloud.',
       'The trainer package to use. Can be a gs or local path.',
       _train),
      ('predict', 'Predict with an ML model.',
       'Show usage from the specified prediction package',
       'Whether to run prediction in the cloud.',
       'The prediction package to use. Can be a gs or local path.',
       _predict),
      ('batch_predict', 'Batch predict with an ML model.',
       'Show usage from the specified prediction package',
       'Whether to run prediction in the cloud.',
       'The prediction package to use. Can be a gs or local path.',
       _batch_predict),
  ]
  # All four subcommands share the same flag set; register them in a loop.
  for name, description, usage_help, cloud_help, package_help, handler in subcommands:
    sub = parser.subcommand(name, description)
    sub.add_argument('--usage', help=usage_help, action='store_true', default=False)
    sub.add_argument('--cloud', help=cloud_help, action='store_true', default=False)
    sub.add_argument('--package', help=package_help, required=True)
    sub.set_defaults(func=handler)

  namespace = datalab.utils.commands.notebook_environment()
  return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)
92+
93+
94+
def _command_template(pr, func_name):
  """Build the YAML argument template and docstring for a package function.

  Returns (args_dump, docstring), where args_dump is in the form of:
    arg1:
    arg2:
    arg3: (optional)
  """
  argspec, docstring = pr.get_func_args_and_docstring(func_name)
  defaults = argspec.defaults if argspec.defaults is not None else ()
  required_count = len(argspec.args) - len(defaults)
  # Required args get a throwaway '(NOT_OP)' marker so yaml emits a value
  # token for them; the marker is stripped from the dump below, leaving a
  # bare "arg:" line. Args with defaults are labelled '(optional)'.
  markers = {}
  for position, arg in enumerate(argspec.args):
    markers[arg] = '(optional)' if position >= required_count else '(NOT_OP)'
  dumped = yaml.safe_dump(markers, default_flow_style=False)
  return dumped.replace('(NOT_OP)', ''), docstring
110+
111+
112+
def _run_package(args, cell, mode):
  """Run a solution-package function, or render its usage help.

  Args:
    args: parsed magic arguments; uses 'package', 'usage' and 'cloud'.
    cell: the magic cell contents, parsed into keyword args for the function.
    mode: one of 'preprocess', 'train', 'predict', 'batch_predict'.

  Returns:
    Rendered usage text when --usage is given, otherwise the return value of
    the package's local_<mode> or cloud_<mode> function.
  """
  local_func_name = 'local_' + mode
  cloud_func_name = 'cloud_' + mode
  with datalab.mlalpha.PackageRunner(args['package']) as pr:
    if args['usage'] is True:
      #TODO Consider calling _command_template once to save one pip installation
      # BUGFIX: '%%' collapses to '%' under %-formatting, so the previous
      # '%%ml' literal displayed as '%ml'. '%%%%' yields the intended cell
      # magic form '%%ml'.
      command_local = """%%%%ml %s --package %s""" % (mode, args['package'])
      args_local, docstring_local = _command_template(pr, local_func_name)
      command_cloud = """%%%%ml %s --package %s --cloud""" % (mode, args['package'])
      args_cloud, docstring_cloud = _command_template(pr, cloud_func_name)
      output = """
Local Run Command:

%s
%s
[Description]:
%s

Cloud Run Command:

%s
%s
[Description]:
%s
""" % (command_local, args_local, docstring_local, command_cloud, args_cloud, docstring_cloud)
      return datalab.utils.commands.render_text(output, preformatted=True)

    env = datalab.utils.commands.notebook_environment()
    func_args = datalab.utils.commands.parse_config(cell, env)
    if args['cloud'] is True:
      return pr.run_func(cloud_func_name, func_args)
    else:
      return pr.run_func(local_func_name, func_args)
145+
146+
147+
def _preprocess(args, cell):
  """Dispatch target for '%%ml preprocess'; delegates to the package runner."""
  return _run_package(args, cell, 'preprocess')


def _train(args, cell):
  """Dispatch target for '%%ml train'; delegates to the package runner."""
  return _run_package(args, cell, 'train')


def _predict(args, cell):
  """Dispatch target for '%%ml predict'; delegates to the package runner."""
  return _run_package(args, cell, 'predict')


def _batch_predict(args, cell):
  """Dispatch target for '%%ml batch_predict'; delegates to the package runner."""
  return _run_package(args, cell, 'batch_predict')

datalab/mlalpha/commands/_mlalpha.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import urllib
2929
import yaml
3030

31+
import datalab.bigquery as bq
3132
import datalab.context
3233
import datalab.data
3334
import datalab.mlalpha
@@ -158,16 +159,19 @@ def mlalpha(line, cell=None):
158159
package_parser.add_argument('--output', help='the output dir of the package.', required=True)
159160
package_parser.set_defaults(func=_package)
160161

161-
package_parser = parser.subcommand('feature-slice-view','View results of a ' +
162+
feature_slice_parser = parser.subcommand('feature-slice-view','View results of a ' +
162163
'FeatureSlicingPipeline, some eval metrics grouped by ' +
163164
'specified feature column values')
164-
package_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline',
165-
required=True)
166-
package_parser.add_argument('--feature',
167-
help='Which feature to view. The feature must be specified ' +
168-
'in the FeatureSlicingPipeline. If not specified, all ' +
169-
'features will be listed.')
170-
package_parser.set_defaults(func=_feature_slice_view)
165+
feature_slice_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline')
166+
feature_slice_parser.add_argument('--sql',
167+
help='The sql module which should return "feature",' +
168+
'"count" columns, plus at least one metric column ' +
169+
'with any names')
170+
feature_slice_parser.add_argument('--feature',
171+
help='Which feature to view. The feature must be specified ' +
172+
'in the FeatureSlicingPipeline. If not specified, all ' +
173+
'features will be listed.')
174+
feature_slice_parser.set_defaults(func=_feature_slice_view)
171175

172176
namespace = datalab.utils.commands.notebook_environment()
173177
return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)
@@ -985,6 +989,20 @@ def _package(args, cell):
985989
print 'Package created at %s.' % dest
986990

987991

992+
def _get_lantern_format(df):
993+
if ('count' not in df) or ('feature' not in df):
994+
raise Exception('No "count" or "feature" found in data.')
995+
metric_names = list(set(df) - set(['feature']))
996+
data = []
997+
for ii, row in df.iterrows():
998+
metric_values = dict(row)
999+
metric_values['totalWeightedExamples'] = metric_values['count']
1000+
del metric_values['feature']
1001+
del metric_values['count']
1002+
data.append({'feature': row['feature'], 'metricValues': metric_values})
1003+
return data
1004+
1005+
9881006
def _feature_slice_view(args, cell):
9891007
HTML_TEMPLATE = """<link rel="import" href="/nbextensions/gcpdatalab/extern/lantern-browser.html" >
9901008
<lantern-browser id="%s"></lantern-browser>
@@ -996,10 +1014,17 @@ def _feature_slice_view(args, cell):
9961014
browser.weightedExamplesColumn = 'totalWeightedExamples';
9971015
browser.calibrationPlotUriFn = function(s) { return '/' + s; }
9981016
</script>"""
999-
with open(args['file']) as f:
1000-
data = map(json.loads, f)
1001-
if args['feature']:
1002-
data = [e for e in data if e['feature'].split(':')[0] == args['feature']]
1017+
if args['sql'] is not None:
1018+
item = datalab.utils.commands.get_notebook_item(args['sql'])
1019+
item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(item, {})
1020+
query = datalab.bigquery.Query(item)
1021+
df = query.results().to_dataframe()
1022+
data = _get_lantern_format(df)
1023+
elif args['dataframe'] is not None:
1024+
item = datalab.utils.commands.get_notebook_item(args['dataframe'])
1025+
data = _get_lantern_format(item)
1026+
else:
1027+
raise Exception('either --sql or --dataframe is needed.')
10031028
metrics_str = str(map(str, data[0]['metricValues'].keys()))
10041029
data_str = str([{str(k): json.dumps(v) for k,v in elem.iteritems()} for elem in data])
10051030
html_id = 'l' + datalab.utils.commands.Html.next_id()

solutionbox/inception/datalab_solutions/inception/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@
1111
# the License.
1212

1313

14-
from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, cloud_predict
14+
from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \
15+
cloud_predict, local_batch_predict, cloud_batch_predict

0 commit comments

Comments
 (0)