This repository was archived by the owner on Sep 3, 2022. It is now read-only.
Merged
1 change: 1 addition & 0 deletions datalab/mlalpha/__init__.py
@@ -28,6 +28,7 @@
from ._confusion_matrix import ConfusionMatrix
from ._analysis import CsvEvalResults, CsvEvalSource, EvalResultsCsvCoder, \
    AccuracyFn, FeatureSlicingPipeline
from ._package_runner import PackageRunner

from plotly.offline import init_notebook_mode

87 changes: 87 additions & 0 deletions datalab/mlalpha/_package_runner.py
@@ -0,0 +1,87 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.

"""Implements running Datalab ML Solution Packages."""

import inspect
import google.cloud.ml as ml
import os
import shutil
import subprocess
import sys
import tempfile


PACKAGE_NAMESPACE = 'datalab_solutions'

class PackageRunner(object):
  """A helper class to run Datalab ML solution packages."""

  def __init__(self, package_uri):
    """
    Args:
      package_uri: The URI of the package. The file base name needs to be in the form
          "name-version", such as "inception-0.1". The part before the first "-" is used
          as the last component of the namespace; in the example above, the namespace
          is "datalab_solutions.inception".
    """
    self._package_uri = package_uri
    self._name = os.path.basename(package_uri).split('-')[0]
    self._install_dir = None

  def _install_to_temp(self):
Contributor: Add a TODO saying to do something better than installing/deleting the package every time a command is run.

Oh, and this is done twice when --usage is used.

Contributor (Author): It's not horribly slow, but that may change if we add dependencies in setup.py.

It is important that each function call be stateless; otherwise it is really hard to manage the session and control when to clean up. Indeed, the --usage case can be further improved to call it once. Added a TODO for that.

Contributor (Author): I implemented __enter__ and __exit__ so we can use it in a with statement. One call for --usage now.

Contributor: I love the idea, but I don't think you did it:

  1. __exit__ should call _cleanup_installation.
  2. I expected __enter__ to be the only function that calls _install_to_temp.
  3. get_func_args_and_docstring (and the other one) still call _install_to_temp(), so the --usage case is still running things twice, I think.

Contributor (Author): Indeed! Fixed. PTAL.

    install_dir = tempfile.mkdtemp()
    tar_path = self._package_uri
    if tar_path.startswith('gs://'):
      tar_path = os.path.join(install_dir, os.path.basename(tar_path))
      ml.util._file.copy_file(self._package_uri, tar_path)
    subprocess.check_call(['pip', 'install', tar_path, '--target', install_dir,
                           '--upgrade', '--force-reinstall'])
    sys.path.insert(0, install_dir)
    self._install_dir = install_dir

  def __enter__(self):
    self._install_to_temp()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self._cleanup_installation()

  def _cleanup_installation(self):
    if self._install_dir is None:
      return
    if sys.path[0] == self._install_dir:
      del sys.path[0]
    shutil.rmtree(self._install_dir)

  def get_func_args_and_docstring(self, func_name):
    """Get a function's argspec and docstring.

    Args:
      func_name: name of the function.

    Returns:
      A tuple of (function argspec, function docstring).
    """
    func = getattr(__import__(PACKAGE_NAMESPACE + '.' + self._name, fromlist=[func_name]),
                   func_name)
    return inspect.getargspec(func), func.__doc__

  def run_func(self, func_name, args):
    """Run a function from the installed package.

    Args:
      func_name: name of the function.
      args: a dict of arguments supplied to the function as keyword arguments.

    Returns:
      The function's return value.
    """
    func = getattr(__import__(PACKAGE_NAMESPACE + '.' + self._name, fromlist=[func_name]),
                   func_name)
    return func(**args)
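
For orientation, a minimal usage sketch of PackageRunner as a context manager (the package URI and the argument names passed to run_func are hypothetical):

import datalab.mlalpha

# The base name 'inception-0.1' maps to the namespace 'datalab_solutions.inception'.
with datalab.mlalpha.PackageRunner('gs://my-bucket/inception-0.1.tar.gz') as pr:
  # __enter__ pip-installs the tarball into a temp dir and prepends it to sys.path.
  argspec, docstring = pr.get_func_args_and_docstring('local_preprocess')
  print docstring
  # run_func imports datalab_solutions.inception and calls local_preprocess(**args).
  pr.run_func('local_preprocess', {'input_dir': './data', 'output_dir': './out'})
# __exit__ removes the temp install dir from sys.path and deletes it.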
1 change: 1 addition & 0 deletions datalab/mlalpha/commands/__init__.py
@@ -13,5 +13,6 @@

from __future__ import absolute_import

from . import _ml
from . import _mlalpha
from . import _tensorboard
160 changes: 160 additions & 0 deletions datalab/mlalpha/commands/_ml.py
@@ -0,0 +1,160 @@
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.

try:
  import IPython
  import IPython.core.magic
except ImportError:
  raise Exception('This module can only be loaded in ipython.')
Contributor: This -> IPython


import collections
import os
import yaml

import datalab.context
import datalab.mlalpha
import datalab.utils.commands


@IPython.core.magic.register_line_cell_magic
def ml(line, cell=None):
  """Implements the ml line cell magic.

  Args:
    line: the contents of the ml line.
    cell: the contents of the ml cell.

  Returns:
    The results of executing the cell.
  """
  parser = datalab.utils.commands.CommandParser(prog="ml", description="""
Execute various ml-related operations. Use "%%ml <command> -h" for help on a specific command.
""")
  preprocess_parser = parser.subcommand('preprocess', 'Run a preprocess job.')
  preprocess_parser.add_argument('--usage',
                                 help='Show usage from the specified preprocess package.',
                                 action='store_true', default=False)
  preprocess_parser.add_argument('--cloud',
                                 help='Whether to run the preprocessing job in the cloud.',
                                 action='store_true', default=False)
  preprocess_parser.add_argument('--package',
                                 help='The preprocess package to use. Can be a gs or local path.',
                                 required=True)
  preprocess_parser.set_defaults(func=_preprocess)

  train_parser = parser.subcommand('train', 'Train an ML model.')
  train_parser.add_argument('--usage',
                            help='Show usage from the specified trainer package.',
                            action='store_true', default=False)
  train_parser.add_argument('--cloud',
                            help='Whether to run the training job in the cloud.',
                            action='store_true', default=False)
  train_parser.add_argument('--package',
                            help='The trainer package to use. Can be a gs or local path.',
                            required=True)
  train_parser.set_defaults(func=_train)

  predict_parser = parser.subcommand('predict', 'Predict with an ML model.')
  predict_parser.add_argument('--usage',
                              help='Show usage from the specified prediction package.',
                              action='store_true', default=False)
  predict_parser.add_argument('--cloud',
                              help='Whether to run prediction in the cloud.',
                              action='store_true', default=False)
  predict_parser.add_argument('--package',
                              help='The prediction package to use. Can be a gs or local path.',
                              required=True)
  predict_parser.set_defaults(func=_predict)

  batch_predict_parser = parser.subcommand('batch_predict', 'Batch predict with an ML model.')
  batch_predict_parser.add_argument('--usage',
                                    help='Show usage from the specified prediction package.',
                                    action='store_true', default=False)
  batch_predict_parser.add_argument('--cloud',
                                    help='Whether to run prediction in the cloud.',
                                    action='store_true', default=False)
  batch_predict_parser.add_argument('--package',
                                    help='The prediction package to use. Can be a gs or local path.',
                                    required=True)
  batch_predict_parser.set_defaults(func=_batch_predict)

  namespace = datalab.utils.commands.notebook_environment()
  return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)


def _command_template(pr, func_name):
  """Return (args_list, docstring).

  args_list is in the form of:
    arg1:
    arg2:
    arg3: (optional)
  """
  argspec, docstring = pr.get_func_args_and_docstring(func_name)
  num_defaults = len(argspec.defaults) if argspec.defaults is not None else 0
  # Need to fill in a placeholder (here '(NOT_OP)') for non-optional args.
  # Later we replace '(NOT_OP)' with an empty string.
  optionals = ['(NOT_OP)'] * (len(argspec.args) - num_defaults) + \
              ['(optional)'] * num_defaults
  args = dict(zip(argspec.args, optionals))
  args_dump = yaml.safe_dump(args, default_flow_style=False).replace('(NOT_OP)', '')
  return args_dump, docstring
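
As a concrete sketch: for a hypothetical package function def local_preprocess(input_dir, output_dir, max_rows=None), the returned args_dump would render as (keys sorted alphabetically by yaml.safe_dump, the default marked optional):

  input_dir:
  max_rows: (optional)
  output_dir: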


def _run_package(args, cell, mode):
  local_func_name = 'local_' + mode
  cloud_func_name = 'cloud_' + mode
  with datalab.mlalpha.PackageRunner(args['package']) as pr:
    if args['usage'] is True:
      # TODO: Consider calling _command_template once to save one pip installation.
      command_local = """%%ml %s --package %s""" % (mode, args['package'])
      args_local, docstring_local = _command_template(pr, local_func_name)
      command_cloud = """%%ml %s --package %s --cloud""" % (mode, args['package'])
      args_cloud, docstring_cloud = _command_template(pr, cloud_func_name)
      output = """
Local Run Command:

%s
%s
[Description]:
%s

Cloud Run Command:

%s
%s
[Description]:
%s
""" % (command_local, args_local, docstring_local, command_cloud, args_cloud, docstring_cloud)
      return datalab.utils.commands.render_text(output, preformatted=True)

    env = datalab.utils.commands.notebook_environment()
    func_args = datalab.utils.commands.parse_config(cell, env)
    if args['cloud'] is True:
      return pr.run_func(cloud_func_name, func_args)
    else:
      return pr.run_func(local_func_name, func_args)


def _preprocess(args, cell):
  return _run_package(args, cell, 'preprocess')


def _train(args, cell):
  return _run_package(args, cell, 'train')


def _predict(args, cell):
  return _run_package(args, cell, 'predict')


def _batch_predict(args, cell):
  return _run_package(args, cell, 'batch_predict')
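
Taken together, a notebook session might look like this (the package path and YAML keys are hypothetical):

%%ml train --usage --package gs://my-bucket/inception-0.1.tar.gz

prints the local and cloud run templates for the package; then

%%ml train --package gs://my-bucket/inception-0.1.tar.gz
input_dir: ./preprocessed
output_dir: ./model

parses the cell body as YAML via parse_config and calls local_train(input_dir='./preprocessed', output_dir='./model') from the installed package.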
49 changes: 37 additions & 12 deletions datalab/mlalpha/commands/_mlalpha.py
@@ -29,6 +29,7 @@
import urllib
import yaml

import datalab.bigquery as bq
import datalab.context
import datalab.data
import datalab.mlalpha
@@ -159,16 +160,19 @@ def mlalpha(line, cell=None):
  package_parser.add_argument('--output', help='the output dir of the package.', required=True)
  package_parser.set_defaults(func=_package)

-  package_parser = parser.subcommand('feature-slice-view', 'View results of a ' +
+  feature_slice_parser = parser.subcommand('feature-slice-view', 'View results of a ' +
                                     'FeatureSlicingPipeline, some eval metrics grouped by ' +
                                     'specified feature column values')
-  package_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline',
-                              required=True)
-  package_parser.add_argument('--feature',
-                              help='Which feature to view. The feature must be specified ' +
-                                   'in the FeatureSlicingPipeline. If not specified, all ' +
-                                   'features will be listed.')
-  package_parser.set_defaults(func=_feature_slice_view)
+  feature_slice_parser.add_argument('--file', help='The results file from FeatureSlicingPipeline')
+  feature_slice_parser.add_argument('--sql',
+                                    help='The sql module which should return "feature" and ' +
+                                         '"count" columns, plus at least one metric column ' +
+                                         'with any name')
+  feature_slice_parser.add_argument('--feature',
+                                    help='Which feature to view. The feature must be specified ' +
+                                         'in the FeatureSlicingPipeline. If not specified, all ' +
+                                         'features will be listed.')
+  feature_slice_parser.set_defaults(func=_feature_slice_view)

  namespace = datalab.utils.commands.notebook_environment()
  return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)
@@ -986,6 +990,20 @@ def _package(args, cell):
  print 'Package created at %s.' % dest


def _get_lantern_format(df):
  if ('count' not in df) or ('feature' not in df):
    raise Exception('No "count" or "feature" column found in the data.')
  data = []
  for _, row in df.iterrows():
    metric_values = dict(row)
    metric_values['totalWeightedExamples'] = metric_values['count']
    del metric_values['feature']
    del metric_values['count']
    data.append({'feature': row['feature'], 'metricValues': metric_values})
  return data


def _feature_slice_view(args, cell):
  HTML_TEMPLATE = """<link rel="import" href="/nbextensions/gcpdatalab/extern/lantern-browser.html" >
<lantern-browser id="%s"></lantern-browser>
@@ -997,10 +1015,17 @@ def _feature_slice_view(args, cell):
browser.weightedExamplesColumn = 'totalWeightedExamples';
browser.calibrationPlotUriFn = function(s) { return '/' + s; }
</script>"""
-  with open(args['file']) as f:
-    data = map(json.loads, f)
-  if args['feature']:
-    data = [e for e in data if e['feature'].split(':')[0] == args['feature']]
+  if args['sql'] is not None:
+    item = datalab.utils.commands.get_notebook_item(args['sql'])
+    item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(item, {})
+    query = datalab.bigquery.Query(item)
+    df = query.results().to_dataframe()
+    data = _get_lantern_format(df)
+  elif args['dataframe'] is not None:
+    item = datalab.utils.commands.get_notebook_item(args['dataframe'])
+    data = _get_lantern_format(item)
+  else:
+    raise Exception('Either --sql or --dataframe is needed.')
  metrics_str = str(map(str, data[0]['metricValues'].keys()))
  data_str = str([{str(k): json.dumps(v) for k, v in elem.iteritems()} for elem in data])
  html_id = 'l' + datalab.utils.commands.Html.next_id()
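
A sketch of the transformation _get_lantern_format performs, with hypothetical slicing results:

import pandas as pd

# One row per feature slice; 'feature' and 'count' are required, and every
# other column is treated as a metric.
df = pd.DataFrame([
    {'feature': 'country:US', 'count': 100, 'accuracy': 0.92},
    {'feature': 'country:CA', 'count': 40, 'accuracy': 0.88},
])
# _get_lantern_format(df) returns:
#   [{'feature': 'country:US',
#     'metricValues': {'accuracy': 0.92, 'totalWeightedExamples': 100}},
#    {'feature': 'country:CA',
#     'metricValues': {'accuracy': 0.88, 'totalWeightedExamples': 40}}]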
@@ -11,4 +11,5 @@
# the License.


-from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, cloud_predict
+from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \
+    cloud_predict, local_batch_predict, cloud_batch_predict
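
For reference, the module re-exported here simply holds the module-level entry points that PackageRunner looks up by name. A hypothetical minimal skeleton (names and signatures are illustrative, not from this diff):

# datalab_solutions/inception/_package.py (hypothetical)
def local_batch_predict(model_dir, input_csv, output_file):
  """Batch predict with a locally stored model."""
  pass

def cloud_batch_predict(model_dir, input_csv, output_file):
  """Batch predict using the Cloud ML service."""
  pass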