diff --git a/datalab/mlalpha/__init__.py b/datalab/ml/__init__.py
similarity index 61%
rename from datalab/mlalpha/__init__.py
rename to datalab/ml/__init__.py
index 9dc5b2b88..99bc309a7 100644
--- a/datalab/mlalpha/__init__.py
+++ b/datalab/ml/__init__.py
@@ -14,20 +14,14 @@
from __future__ import absolute_import
-from ._local_runner import LocalRunner
-from ._cloud_runner import CloudRunner
-from ._metadata import Metadata
-from ._local_predictor import LocalPredictor
-from ._cloud_predictor import CloudPredictor
-from ._job import Jobs
+from ._job import Jobs, Job
from ._summary import Summary
-from ._tensorboard import TensorBoardManager
-from ._dataset import DataSet
-from ._package import Packager
-from ._cloud_models import CloudModels, CloudModelVersions
+from ._tensorboard import TensorBoard
+from ._dataset import CsvDataSet, BigQueryDataSet
+from ._cloud_models import Models, ModelVersions
from ._confusion_matrix import ConfusionMatrix
+from ._feature_slice_view import FeatureSliceView
+from ._cloud_training_config import CloudTrainingConfig
+from ._util import *
-from plotly.offline import init_notebook_mode
-
-init_notebook_mode()
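Reviewer note: this hunk moves the public surface from `datalab.mlalpha` to `datalab.ml` and renames several classes (`CloudModels` → `Models`, `CloudModelVersions` → `ModelVersions`, `TensorBoardManager` → `TensorBoard`). A minimal, hypothetical sketch of the new import surface (the model name is a placeholder and the default Datalab project is assumed):

```python
import datalab.ml as ml

models = ml.Models()                                    # formerly mlalpha.CloudModels
versions = ml.ModelVersions('iris')                     # formerly mlalpha.CloudModelVersions
config = ml.CloudTrainingConfig(region='us-central1')   # new training config tuple
```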
diff --git a/datalab/ml/_cloud_models.py b/datalab/ml/_cloud_models.py
new file mode 100644
index 000000000..5e5098e0a
--- /dev/null
+++ b/datalab/ml/_cloud_models.py
@@ -0,0 +1,274 @@
+# Copyright 2016 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+"""Implements Cloud ML Model Operations"""
+
+from googleapiclient import discovery
+import os
+import yaml
+
+import datalab.context
+import datalab.storage
+import datalab.utils
+
+from . import _util
+
+class Models(object):
+ """Represents a list of Cloud ML models for a project."""
+
+ def __init__(self, project_id=None):
+ """
+ Args:
+ project_id: project_id of the models. If not provided, default project_id will be used.
+ """
+ if project_id is None:
+ project_id = datalab.context.Context.default().project_id
+ self._project_id = project_id
+ self._credentials = datalab.context.Context.default().credentials
+ self._api = discovery.build('ml', 'v1', credentials=self._credentials)
+
+ def _retrieve_models(self, page_token, page_size):
+ list_info = self._api.projects().models().list(
+ parent='projects/' + self._project_id, pageToken=page_token, pageSize=page_size).execute()
+ models = list_info.get('models', [])
+ page_token = list_info.get('nextPageToken', None)
+ return models, page_token
+
+ def get_iterator(self):
+ """Get iterator of models so it can be used as "for model in Models().get_iterator()".
+ """
+ return iter(datalab.utils.Iterator(self._retrieve_models))
+
+ def get_model_details(self, model_name):
+ """Get details of the specified model from CloudML Service.
+
+ Args:
+ model_name: the name of the model. It can be a model full name
+ ("projects/[project_id]/models/[model_name]") or just [model_name].
+ Returns: a dictionary of the model details.
+ """
+ full_name = model_name
+ if not model_name.startswith('projects/'):
+ full_name = ('projects/%s/models/%s' % (self._project_id, model_name))
+ return self._api.projects().models().get(name=full_name).execute()
+
+ def create(self, model_name):
+ """Create a model.
+
+ Args:
+ model_name: the short name of the model, such as "iris".
+ Returns:
+      If successful, returns information about the model, such as
+ {u'regions': [u'us-central1'], u'name': u'projects/myproject/models/mymodel'}
+ Raises:
+      Exception if the model creation failed.
+ """
+ body = {'name': model_name}
+ parent = 'projects/' + self._project_id
+ # Model creation is instant. If anything goes wrong, Exception will be thrown.
+ return self._api.projects().models().create(body=body, parent=parent).execute()
+
+ def delete(self, model_name):
+ """Delete a model.
+
+ Args:
+ model_name: the name of the model. It can be a model full name
+ ("projects/[project_id]/models/[model_name]") or just [model_name].
+ """
+ full_name = model_name
+ if not model_name.startswith('projects/'):
+ full_name = ('projects/%s/models/%s' % (self._project_id, model_name))
+ response = self._api.projects().models().delete(name=full_name).execute()
+ if 'name' not in response:
+ raise Exception('Invalid response from service. "name" is not found.')
+ _util.wait_for_long_running_operation(response['name'])
+
+ def list(self, count=10):
+ """List models under the current project in a table view.
+
+ Args:
+ count: upper limit of the number of models to list.
+ Raises:
+ Exception if it is called in a non-IPython environment.
+ """
+ import IPython
+ data = []
+    # Zip with range(count) so the loop stops either when count is reached or when
+    # the iterator returned by get_iterator() is exhausted.
+    for _, model in zip(range(count), self.get_iterator()):
+ element = {'name': model['name']}
+ if 'defaultVersion' in model:
+ version_short_name = model['defaultVersion']['name'].split('/')[-1]
+ element['defaultVersion'] = version_short_name
+ data.append(element)
+
+ IPython.display.display(
+ datalab.utils.commands.render_dictionary(data, ['name', 'defaultVersion']))
+
+ def describe(self, model_name):
+ """Print information of a specified model.
+
+ Args:
+ model_name: the name of the model to print details on.
+ """
+ model_yaml = yaml.safe_dump(self.get_model_details(model_name), default_flow_style=False)
+    print(model_yaml)
+
+
+class ModelVersions(object):
+ """Represents a list of versions for a Cloud ML model."""
+
+ def __init__(self, model_name, project_id=None):
+ """
+ Args:
+ model_name: the name of the model. It can be a model full name
+ ("projects/[project_id]/models/[model_name]") or just [model_name].
+ project_id: project_id of the models. If not provided and model_name is not a full name
+ (not including project_id), default project_id will be used.
+ """
+    if project_id is None:
+      project_id = datalab.context.Context.default().project_id
+    self._project_id = project_id
+ self._credentials = datalab.context.Context.default().credentials
+ self._api = discovery.build('ml', 'v1', credentials=self._credentials)
+ if not model_name.startswith('projects/'):
+ model_name = ('projects/%s/models/%s' % (self._project_id, model_name))
+ self._full_model_name = model_name
+ self._model_name = self._full_model_name.split('/')[-1]
+
+ def _retrieve_versions(self, page_token, page_size):
+ parent = self._full_model_name
+ list_info = self._api.projects().models().versions().list(parent=parent,
+ pageToken=page_token, pageSize=page_size).execute()
+ versions = list_info.get('versions', [])
+ page_token = list_info.get('nextPageToken', None)
+ return versions, page_token
+
+ def get_iterator(self):
+ """Get iterator of versions so it can be used as
+ "for v in ModelVersions(model_name).get_iterator()".
+ """
+ return iter(datalab.utils.Iterator(self._retrieve_versions))
+
+ def get_version_details(self, version_name):
+ """Get details of a version.
+
+ Args:
+      version_name: the name of the version in short form, such as "v1".
+ Returns: a dictionary containing the version details.
+ """
+ name = ('%s/versions/%s' % (self._full_model_name, version_name))
+ return self._api.projects().models().versions().get(name=name).execute()
+
+ def deploy(self, version_name, path):
+ """Deploy a model version to the cloud.
+
+ Args:
+ version_name: the name of the version in short form, such as "v1".
+ path: the Google Cloud Storage path (gs://...) which contains the model files.
+
+ Raises: Exception if the path is invalid or does not contain expected files.
+ Exception if the service returns invalid response.
+ """
+ if not path.startswith('gs://'):
+ raise Exception('Invalid path. Only Google Cloud Storage path (gs://...) is accepted.')
+
+    # If there is no "export.meta" or "saved_model.pb" under path but there is
+ # path/model/export.meta or path/model/saved_model.pb, then append /model to the path.
+ if (not datalab.storage.Item.from_url(os.path.join(path, 'export.meta')).exists() and
+ not datalab.storage.Item.from_url(os.path.join(path, 'saved_model.pb')).exists()):
+ if (datalab.storage.Item.from_url(os.path.join(path, 'model', 'export.meta')).exists() or
+ datalab.storage.Item.from_url(os.path.join(path, 'model', 'saved_model.pb')).exists()):
+ path = os.path.join(path, 'model')
+ else:
+        print('Cannot find export.meta or saved_model.pb, continuing with deployment anyway.')
+
+ body = {'name': self._model_name}
+ parent = 'projects/' + self._project_id
+ try:
+ self._api.projects().models().create(body=body, parent=parent).execute()
+ except:
+ # Trying to create an already existing model gets an error. Ignore it.
+ pass
+ body = {
+ 'name': version_name,
+ 'deployment_uri': path,
+ 'runtime_version': '1.0',
+ }
+ response = self._api.projects().models().versions().create(body=body,
+ parent=self._full_model_name).execute()
+ if 'name' not in response:
+ raise Exception('Invalid response from service. "name" is not found.')
+ _util.wait_for_long_running_operation(response['name'])
+
+ def delete(self, version_name):
+ """Delete a version of model.
+
+ Args:
+ version_name: the name of the version in short form, such as "v1".
+ """
+ name = ('%s/versions/%s' % (self._full_model_name, version_name))
+ response = self._api.projects().models().versions().delete(name=name).execute()
+ if 'name' not in response:
+ raise Exception('Invalid response from service. "name" is not found.')
+ _util.wait_for_long_running_operation(response['name'])
+
+ def predict(self, version_name, data):
+ """Get prediction results from features instances.
+
+ Args:
+ version_name: the name of the version used for prediction.
+      data: typically a list of instances to be submitted for prediction. The format of an
+          instance depends on the model. For example, a structured data model may require
+          a CSV line for each instance.
+          Note that online prediction only works on models that take one placeholder value,
+          such as a string encoding a CSV line.
+ Returns:
+ A list of prediction results for given instances. Each element is a dictionary representing
+ output mapping from the graph.
+ An example:
+ [{"predictions": 1, "score": [0.00078, 0.71406, 0.28515]},
+ {"predictions": 1, "score": [0.00244, 0.99634, 0.00121]}]
+ """
+ full_version_name = ('%s/versions/%s' % (self._full_model_name, version_name))
+ request = self._api.projects().predict(body={'instances': data},
+ name=full_version_name)
+ request.headers['user-agent'] = 'GoogleCloudDataLab/1.0'
+ result = request.execute()
+ if 'predictions' not in result:
+ raise Exception('Invalid response from service. Cannot find "predictions" in response.')
+
+ return result['predictions']
+
+ def describe(self, version_name):
+ """Print information of a specified model.
+
+ Args:
+      version_name: the name of the version in short form, such as "v1".
+ """
+ version_yaml = yaml.safe_dump(self.get_version_details(version_name),
+ default_flow_style=False)
+    print(version_yaml)
+
+ def list(self):
+ """List versions under the current model in a table view.
+
+ Raises:
+ Exception if it is called in a non-IPython environment.
+ """
+ import IPython
+
+ # "self" is iterable (see __iter__() method).
+ data = [{'name': version['name'].split()[-1],
+ 'deploymentUri': version['deploymentUri'], 'createTime': version['createTime']}
+ for version in self]
+ IPython.display.display(
+ datalab.utils.commands.render_dictionary(data, ['name', 'deploymentUri', 'createTime']))
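For context, a hedged usage sketch of `Models` and `ModelVersions` as defined above; the model name, version name, GCS path, and CSV instance below are placeholders:

```python
import datalab.ml as ml

models = ml.Models()
models.create('iris')           # creation is instant; raises on failure
models.list(count=5)            # renders a table (IPython environments only)

versions = ml.ModelVersions('iris')
# Deploys the exported model under the GCS path and waits for the
# long-running operation to complete.
versions.deploy('v1', 'gs://my-bucket/iris/model')

# Online prediction takes one instance per list element, e.g. a CSV line.
results = versions.predict('v1', ['5.1,3.5,1.4,0.2'])
```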
diff --git a/datalab/ml/_cloud_training_config.py b/datalab/ml/_cloud_training_config.py
new file mode 100644
index 000000000..9fcfddb89
--- /dev/null
+++ b/datalab/ml/_cloud_training_config.py
@@ -0,0 +1,47 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+
+_CloudTrainingConfig = namedtuple("CloudConfig",
+ ['region', 'scale_tier', 'master_type', 'worker_type',
+ 'parameter_server_type', 'worker_count', 'parameter_server_count'])
+_CloudTrainingConfig.__new__.__defaults__ = ('BASIC', None, None, None, None, None)
+
+
+class CloudTrainingConfig(_CloudTrainingConfig):
+ """A config namedtuple containing cloud specific configurations for CloudML training.
+
+ Fields:
+ region: the region of the training job to be submitted. For example, "us-central1".
+ Run "gcloud compute regions list" to get a list of regions.
+ scale_tier: Specifies the machine types, the number of replicas for workers and
+ parameter servers. For example, "STANDARD_1". See
+ https://cloud.google.com/ml/reference/rest/v1beta1/projects.jobs#scaletier
+ for list of accepted values.
+ master_type: specifies the type of virtual machine to use for your training
+ job's master worker. Must set this value when scale_tier is set to CUSTOM.
+ See the link in "scale_tier".
+ worker_type: specifies the type of virtual machine to use for your training
+ job's worker nodes. Must set this value when scale_tier is set to CUSTOM.
+ parameter_server_type: specifies the type of virtual machine to use for your training
+ job's parameter server. Must set this value when scale_tier is set to CUSTOM.
+ worker_count: the number of worker replicas to use for the training job. Each
+ replica in the cluster will be of the type specified in "worker_type".
+ Must set this value when scale_tier is set to CUSTOM.
+ parameter_server_count: the number of parameter server replicas to use. Each
+ replica in the cluster will be of the type specified in "parameter_server_type".
+ Must set this value when scale_tier is set to CUSTOM.
+ """
+ pass
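A short sketch of constructing the config above; the machine types and replica counts are illustrative placeholders, not recommendations:

```python
from datalab.ml import CloudTrainingConfig

# Only region is required; scale_tier defaults to 'BASIC'.
basic = CloudTrainingConfig(region='us-central1')

# With scale_tier='CUSTOM', machine types and replica counts must be supplied.
custom = CloudTrainingConfig(
    region='us-central1',
    scale_tier='CUSTOM',
    master_type='large_model',
    worker_type='standard',
    parameter_server_type='standard',
    worker_count=4,
    parameter_server_count=2)

print(custom.worker_count)  # fields are plain namedtuple attributes
```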
diff --git a/datalab/ml/_confusion_matrix.py b/datalab/ml/_confusion_matrix.py
new file mode 100644
index 000000000..c7043943d
--- /dev/null
+++ b/datalab/ml/_confusion_matrix.py
@@ -0,0 +1,111 @@
+# Copyright 2016 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+
+import numpy as np
+import json
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.metrics import confusion_matrix
+
+import datalab.bigquery as bq
+import datalab.data as data
+
+from . import _util
+
+
+class ConfusionMatrix(object):
+ """Represents a confusion matrix."""
+
+ def __init__(self, cm, labels):
+ """
+ Args:
+ cm: a 2-dimensional matrix with row index being target, column index being predicted,
+        and values being counts.
+ labels: the labels whose order matches the row/column indexes.
+ """
+ self._cm = cm
+ self._labels = labels
+
+ @staticmethod
+ def from_csv(input_csv, headers=None, schema_file=None):
+ """Create a ConfusionMatrix from a csv file.
+ Args:
+ input_csv: Path to a Csv file (with no header). Can be local or GCS path.
+ headers: Csv headers. If present, it must include 'target' and 'predicted'.
+ schema_file: Path to a JSON file containing BigQuery schema. Used if "headers" is None.
+ If present, it must include 'target' and 'predicted' columns.
+ Returns:
+ A ConfusionMatrix that can be plotted.
+ Raises:
+      ValueError if both headers and schema_file are None, or if the data does not include
+        'target' or 'predicted' columns.
+ """
+
+ if headers is not None:
+ names = headers
+ elif schema_file is not None:
+ with _util.open_local_or_gcs(schema_file, mode='r') as f:
+ schema = json.load(f)
+ names = [x['name'] for x in schema]
+ else:
+ raise ValueError('Either headers or schema_file is needed')
+ with _util.open_local_or_gcs(input_csv, mode='r') as f:
+ df = pd.read_csv(f, names=names)
+ if 'target' not in df or 'predicted' not in df:
+ raise ValueError('Cannot find "target" or "predicted" column')
+
+ labels = sorted(set(df['target']) | set(df['predicted']))
+ cm = confusion_matrix(df['target'], df['predicted'], labels=labels)
+ return ConfusionMatrix(cm, labels)
+
+ @staticmethod
+ def from_bigquery(sql):
+ """Create a ConfusionMatrix from a BigQuery table or query.
+ Args:
+ sql: Can be one of:
+ A SQL query string.
+ A SQL Query module defined with '%%sql --name [module_name]'.
+          A BigQuery table.
+ The query results or table must include "target", "predicted" columns.
+ Returns:
+ A ConfusionMatrix that can be plotted.
+ Raises:
+ ValueError if query results or table does not include 'target' or 'predicted' columns.
+ """
+
+ query, _ = data.SqlModule.get_sql_statement_with_environment(sql, {})
+ sql = ('SELECT target, predicted, count(*) as count FROM (%s) group by target, predicted'
+ % query.sql)
+ df = bq.Query(sql).results().to_dataframe()
+ labels = sorted(set(df['target']) | set(df['predicted']))
+ labels_count = len(labels)
+ df['target'] = [labels.index(x) for x in df['target']]
+ df['predicted'] = [labels.index(x) for x in df['predicted']]
+ cm = [[0]*labels_count for i in range(labels_count)]
+ for index, row in df.iterrows():
+ cm[row['target']][row['predicted']] = row['count']
+ return ConfusionMatrix(cm, labels)
+
+ def plot(self):
+ """Plot the confusion matrix."""
+
+ plt.imshow(self._cm, interpolation='nearest', cmap=plt.cm.Blues)
+ plt.title('Confusion matrix')
+ plt.colorbar()
+ tick_marks = np.arange(len(self._labels))
+ plt.xticks(tick_marks, self._labels, rotation=45)
+ plt.yticks(tick_marks, self._labels)
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
+
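A minimal sketch of the two constructors above; the CSV path, header names, and table name are placeholders:

```python
from datalab.ml import ConfusionMatrix

# From a headerless CSV of prediction results.
cm = ConfusionMatrix.from_csv(
    'gs://my-bucket/eval/predictions.csv',
    headers=['key', 'target', 'predicted'])
cm.plot()  # matplotlib heatmap

# From a BigQuery query whose results include "target" and "predicted".
cm2 = ConfusionMatrix.from_bigquery(
    'SELECT target, predicted FROM mydataset.predictions')
cm2.plot()
```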
diff --git a/datalab/ml/_dataset.py b/datalab/ml/_dataset.py
new file mode 100644
index 000000000..434382b50
--- /dev/null
+++ b/datalab/ml/_dataset.py
@@ -0,0 +1,187 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+
+"""Implements DataSets that serve two purposes:
+
+1. The recommended way to pass a data source to ML packages.
+2. All DataSets can be sampled into a pandas DataFrame for analysis/visualization.
+"""
+
+import json
+import numpy as np
+import pandas as pd
+import random
+
+import datalab.bigquery as bq
+import datalab.data
+
+from . import _util
+
+
+class CsvDataSet(object):
+ """DataSet based on CSV files and schema."""
+
+ def __init__(self, file_pattern, schema=None, schema_file=None):
+ """
+ Args:
+      file_pattern: A list of CSV file paths, or a single path string. File names can
+        contain wildcards. Paths can be local or GCS.
+      schema: A BigQuery schema object in the form of
+          [{'name': 'col1', 'type': 'STRING'},
+           {'name': 'col2', 'type': 'INTEGER'}]
+          or a single string of the form 'col1:STRING,col2:INTEGER,col3:FLOAT'.
+      schema_file: Path to a JSON-serialized schema file. Loaded only when schema is None.
+    Raises:
+ ValueError if both schema and schema_file are None.
+ """
+ if schema is None and schema_file is None:
+ raise ValueError('schema and schema_file cannot both be None.')
+
+ if schema is not None:
+ if isinstance(schema, list):
+ self._schema = schema
+ else:
+ self._schema = []
+ for x in schema.split(','):
+ parts = x.split(':')
+ if len(parts) != 2:
+ raise ValueError('invalid schema string "%s"' % x)
+ self._schema.append({'name': parts[0].strip(), 'type': parts[1].strip()})
+ else:
+ with _util.open_local_or_gcs(schema_file, 'r') as f:
+ self._schema = json.load(f)
+
+ if isinstance(file_pattern, basestring):
+ file_pattern = [file_pattern]
+ self._input_files = file_pattern
+
+ self._glob_files = []
+
+
+ @property
+ def input_files(self):
+ """Returns the file list that was given to this class without globing files."""
+ return self._input_files
+
+ @property
+ def files(self):
+ if not self._glob_files:
+ for file in self._input_files:
+        # glob_files() returns unicode strings, which Dataflow does not accept, so cast with str().
+ self._glob_files += [str(x) for x in _util.glob_files(file)]
+
+ return self._glob_files
+
+ @property
+ def schema(self):
+ return self._schema
+
+ def sample(self, n):
+ """ Samples data into a Pandas DataFrame.
+ Args:
+ n: number of sampled counts.
+ Returns:
+ A dataframe containing sampled data.
+ Raises:
+ Exception if n is larger than number of rows.
+ """
+ row_total_count = 0
+ row_counts = []
+ for file in self.files:
+ with _util.open_local_or_gcs(file, 'r') as f:
+ num_lines = sum(1 for line in f)
+ row_total_count += num_lines
+ row_counts.append(num_lines)
+
+ names = None
+ dtype = None
+ if self._schema:
+ _MAPPINGS = {
+ 'FLOAT': np.float64,
+ 'INTEGER': np.int64,
+ 'TIMESTAMP': np.datetime64,
+ 'BOOLEAN': np.bool,
+ }
+ names = [x['name'] for x in self._schema]
+ dtype = {x['name']: _MAPPINGS.get(x['type'], object) for x in self._schema}
+
+ skip_count = row_total_count - n
+ # Get all skipped indexes. These will be distributed into each file.
+    # Note that random.sample raises ValueError if skip_count is out of range
+    # (for example, when n is larger than the total row count).
+ skip_all = sorted(random.sample(xrange(0, row_total_count), skip_count))
+ dfs = []
+ for file, row_count in zip(self.files, row_counts):
+ skip = [x for x in skip_all if x < row_count]
+ skip_all = [x - row_count for x in skip_all if x >= row_count]
+ with _util.open_local_or_gcs(file, 'r') as f:
+ dfs.append(pd.read_csv(f, skiprows=skip, names=names, dtype=dtype, header=None))
+ return pd.concat(dfs, axis=0, ignore_index=True)
+
+
+class BigQueryDataSet(object):
+ """DataSet based on BigQuery table or query."""
+
+ def __init__(self, sql=None, table=None):
+ """
+ Args:
+ sql: A SQL query string, or a SQL Query module defined with '%%sql --name [module_name]'
+ table: A table name in the form of "dataset:table".
+ Raises:
+ ValueError if both sql and table are set, or both are None.
+ """
+ if (sql is None and table is None) or (sql is not None and table is not None):
+ raise ValueError('One and only one of sql and table should be set.')
+
+ self._query = None
+ self._table = None
+ if sql is not None:
+ query, _ = datalab.data.SqlModule.get_sql_statement_with_environment(sql, {})
+ self._query = query.sql
+ if table is not None:
+ self._table = table
+ self._schema = None
+
+ @property
+ def query(self):
+ return self._query
+
+ @property
+ def table(self):
+ return self._table
+
+ @property
+ def schema(self):
+ if self._schema is None:
+ source = self._query or self._table
+ self._schema = bq.Query('SELECT * FROM (%s) LIMIT 1' % source).results().schema
+ return self._schema
+
+ def sample(self, n):
+ """Samples data into a Pandas DataFrame. Note that it calls BigQuery so it will
+ incur cost.
+ Args:
+ n: number of sampled counts. Note that the number of counts returned is approximated.
+ Returns:
+ A dataframe containing sampled data.
+ Raises:
+ Exception if n is larger than number of rows.
+ """
+ source = self._query or self._table
+ total = bq.Query('select count(*) from (%s)' % source).results()[0].values()[0]
+ if n > total:
+ raise ValueError('sample larger than population')
+ sampling = bq.Sampling.random(n*100.0/float(total))
+ sample = bq.Query(source).sample(sampling=sampling)
+ df = sample.to_dataframe()
+ return df
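A hedged sketch of constructing and sampling the two dataset classes above; the file pattern, schema string, and query are placeholders:

```python
import datalab.ml as ml

csv_data = ml.CsvDataSet(
    'gs://my-bucket/data/train-*.csv',
    schema='key:INTEGER,species:STRING,petal_length:FLOAT')
print(csv_data.files)        # globbed file list
df = csv_data.sample(1000)   # pandas DataFrame with 1000 randomly sampled rows

bq_data = ml.BigQueryDataSet(sql='SELECT * FROM mydataset.mytable')
sample_df = bq_data.sample(500)   # approximate row count; issues BigQuery queries
```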
diff --git a/datalab/ml/_feature_slice_view.py b/datalab/ml/_feature_slice_view.py
new file mode 100644
index 000000000..474b7843e
--- /dev/null
+++ b/datalab/ml/_feature_slice_view.py
@@ -0,0 +1,87 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+import json
+import pandas as pd
+from types import ModuleType
+
+import datalab.bigquery
+import datalab.data
+import datalab.utils
+
+
+class FeatureSliceView(object):
+ """Represents A feature slice view."""
+
+ def _get_lantern_format(self, df):
+ """ Feature slice view browser expects data in the format of:
+ {"metricValues": {"count": 12, "accuracy": 1.0}, "feature": "species:Iris-setosa"}
+ {"metricValues": {"count": 11, "accuracy": 0.72}, "feature": "species:Iris-versicolor"}
+ ...
+ This function converts a DataFrame to such format.
+ """
+
+ if ('count' not in df) or ('feature' not in df):
+      raise Exception('Data must include both "count" and "feature" columns.')
+ if len(df.columns) < 3:
+      raise Exception('Need at least one metric column.')
+ if len(df) == 0:
+ raise Exception('Data is empty')
+
+ data = []
+ for _, row in df.iterrows():
+ metric_values = dict(row)
+ feature = metric_values.pop('feature')
+ data.append({'feature': feature, 'metricValues': metric_values})
+ return data
+
+ def plot(self, data):
+ """ Plots a featire slice view on given data.
+
+ Args:
+ data: Can be one of:
+        A SQL query string.
+        A SQL query module defined by "%%sql --module module_name".
+ A pandas DataFrame.
+ Regardless of data type, it must include the following columns:
+ "feature": identifies a slice of features. For example: "petal_length:4.0-4.2".
+ "count": number of instances in that slice of features.
+ All other columns are viewed as metrics for its feature slice. At least one is required.
+ """
+ import IPython
+
+ if isinstance(data, ModuleType) or isinstance(data, basestring):
+ item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(data, {})
+ query = datalab.bigquery.Query(item)
+ df = query.results().to_dataframe()
+ data = self._get_lantern_format(df)
+ elif isinstance(data, pd.core.frame.DataFrame):
+ data = self._get_lantern_format(data)
+ else:
+      raise Exception('data needs to be a SQL query or a pandas DataFrame.')
+
+ HTML_TEMPLATE = """
+
TensorBoard was started successfully with pid %d. ' % p.pid + html += 'Click here to access it.
' % url + IPython.display.display_html(html, raw=True) + return time.sleep(1) retry -= 1 diff --git a/datalab/ml/_util.py b/datalab/ml/_util.py new file mode 100644 index 000000000..5db1c8245 --- /dev/null +++ b/datalab/ml/_util.py @@ -0,0 +1,109 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from apache_beam.io import gcsio +import datetime +import glob +from googleapiclient import discovery +import os +import shutil +import subprocess +import tempfile +import time + +import datalab.context + +# TODO: Create an Operation class. +def wait_for_long_running_operation(operation_full_name): + print('Waiting for operation "%s"' % operation_full_name) + api = discovery.build('ml', 'v1', credentials=datalab.context.Context.default().credentials) + while True: + response = api.projects().operations().get(name=operation_full_name).execute() + if 'done' not in response or response['done'] != True: + time.sleep(3) + else: + if 'error' in response: + print(response['error']) + else: + print('Done.') + break + + +def package_and_copy(package_root_dir, setup_py, output_tar_path): + """Repackage an CloudML package and copy it to a staging dir. + + Args: + package_root_dir: the root dir to install package from. Usually you can get the path + from inside your module using a relative path to __file__. + setup_py: the path to setup.py. + output_tar_path: the GCS path of the output tarball package. + Raises: + ValueError if output_tar_path is not a GCS path, or setup_py does not exist. + """ + if not output_tar_path.startswith('gs://'): + raise ValueError('output_tar_path needs to be a GCS path.') + if not os.path.isfile(setup_py): + raise ValueError('Supplied file "%s" does not exist.' % setup_py) + + dest_setup_py = os.path.join(package_root_dir, 'setup.py') + # setuptools requires a "setup.py" in the current dir, so copy setup.py there. + # Also check if there is an existing setup.py. If so, back it up. + if os.path.isfile(dest_setup_py): + os.rename(dest_setup_py, dest_setup_py + '._bak_') + shutil.copyfile(setup_py, dest_setup_py) + + tempdir = tempfile.mkdtemp() + previous_cwd = os.getcwd() + os.chdir(package_root_dir) + try: + # Repackage. + sdist = ['python', dest_setup_py, 'sdist', '--format=gztar', '-d', tempdir] + subprocess.check_call(sdist) + + # Copy to GCS. + source = os.path.join(tempdir, '*.tar.gz') + gscopy = ['gsutil', 'cp', source, output_tar_path] + subprocess.check_call(gscopy) + return + finally: + os.chdir(previous_cwd) + os.remove(dest_setup_py) + if os.path.isfile(dest_setup_py + '._bak_'): + os.rename(dest_setup_py + '._bak_', dest_setup_py) + shutil.rmtree(tempdir) + + +def open_local_or_gcs(path, mode): + """Opens the given path.""" + + if path.startswith('gs://'): + try: + return gcsio.GcsIO().open(path, mode) + except Exception as e: # pylint: disable=broad-except + # Currently we retry exactly once, to work around flaky gcs calls. 
+ logging.error('Retrying after exception reading gcs file: %s', e) + time.sleep(10) + return gcsio.GcsIO().open(path, mode) + else: + return open(path, mode) + + +def glob_files(path): + """Glob the given path.""" + + if path.startswith('gs://'): + return gcsio.GcsIO().glob(path) + else: + return glob.glob(path) diff --git a/datalab/mlalpha/_cloud_models.py b/datalab/mlalpha/_cloud_models.py deleted file mode 100644 index 645024415..000000000 --- a/datalab/mlalpha/_cloud_models.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -"""Implements Cloud ML Model Operations""" - -from googleapiclient import discovery -import os -import time - -import datalab.context -import datalab.storage -import datalab.utils - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudModels(object): - """Represents a list of Cloud ML models for a project.""" - - def __init__(self, project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudML Model list that is iteratable - ("for model in CloudModels()"). - - Args: - project_id: project_id of the models. If not provided, default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default credentials - will be used. - api: an optional CloudML API client. - """ - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - - def _retrieve_models(self, page_token, page_size): - list_info = self._api.projects().models().list(parent='projects/' + self._project_id, - pageToken=page_token, pageSize=page_size).execute() - models = list_info.get('models', []) - page_token = list_info.get('nextPageToken', None) - return models, page_token - - def __iter__(self): - return iter(datalab.utils.Iterator(self._retrieve_models)) - - def get(self, model_name): - """Get details of a model. - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - Returns: a dictionary of the model details. - """ - full_name = model_name - if not model_name.startswith('projects/'): - full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - return self._api.projects().models().get(name=full_name).execute() - - def create(self, model_name): - """Create a model. - - Args: - model_name: the short name of the model, such as "iris". 
- """ - body = {'name': model_name} - parent = 'projects/' + self._project_id - self._api.projects().models().create(body=body, parent=parent).execute() - - def delete(self, model_name): - """Delete a model. - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - """ - full_name = model_name - if not model_name.startswith('projects/'): - full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - return self._api.projects().models().delete(name=full_name).execute() - - -class CloudModelVersions(object): - """Represents a list of versions for a Cloud ML model.""" - - def __init__(self, model_name, project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudML model version list that is iteratable - ("for version in CloudModelVersions()"). - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - project_id: project_id of the models. If not provided and model_name is not a full name - (not including project_id), default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default - credentials will be used. - api: an optional CloudML API client. - """ - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1alpha3', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - if not model_name.startswith('projects/'): - model_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - self._full_model_name = model_name - self._model_name = self._full_model_name.split('/')[-1] - - def _retrieve_versions(self, page_token, page_size): - parent = self._full_model_name - list_info = self._api.projects().models().versions().list(parent=parent, - pageToken=page_token, pageSize=page_size).execute() - versions = list_info.get('versions', []) - page_token = list_info.get('nextPageToken', None) - return versions, page_token - - def __iter__(self): - return iter(datalab.utils.Iterator(self._retrieve_versions)) - - def get(self, version_name): - """Get details of a version. - - Args: - version: the name of the version in short form, such as "v1". - Returns: a dictionary containing the version details. - """ - name = ('%s/versions/%s' % (self._full_model_name, version_name)) - return self._api.projects().models().versions().get(name=name).execute() - - def _wait_for_long_running_operation(self, response): - if 'name' not in response: - raise Exception('Invaid response from service. Cannot find "name" field.') - while True: - response = self._api.projects().operations().get(name=response['name']).execute() - if 'done' not in response or response['done'] != True: - time.sleep(3) - else: - if 'error' in response: - print response['error'] - break - - def deploy(self, version_name, path): - """Deploy a model version to the cloud. - - Args: - version_name: the name of the version in short form, such as "v1". - path: the Google Cloud Storage path (gs://...) which contains the model files. - - Raises: Exception if the path is invalid or does not contain expected files. - Exception if the service returns invalid response. 
- """ - if not path.startswith('gs://'): - raise Exception('Invalid path. Only Google Cloud Storage path (gs://...) is accepted.') - if not datalab.storage.Item.from_url(os.path.join(path, 'export.meta')).exists(): - raise Exception('Cannot find export.meta from given path.') - - body = {'name': self._model_name} - parent = 'projects/' + self._project_id - try: - self._api.projects().models().create(body=body, parent=parent).execute() - except: - # Trying to create an already existing model gets an error. Ignore it. - pass - body = { - 'name': version_name, - 'deployment_uri': path, - } - response = self._api.projects().models().versions().create(body=body, - parent=self._full_model_name).execute() - self._wait_for_long_running_operation(response) - - def delete(self, version_name): - """Delete a version of model. - - Args: - version_name: the name of the version in short form, such as "v1". - """ - name = ('%s/versions/%s' % (self._full_model_name, version_name)) - response = self._api.projects().models().versions().delete(name=name).execute() - self._wait_for_long_running_operation(response) diff --git a/datalab/mlalpha/_cloud_predictor.py b/datalab/mlalpha/_cloud_predictor.py deleted file mode 100644 index 8209d77a1..000000000 --- a/datalab/mlalpha/_cloud_predictor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -from googleapiclient import discovery -import pandas as pd - -import datalab.context -import datalab.utils - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudPredictor(object): - """Preforms cloud predictions on given data.""" - - # TODO: Either remove label_output, or add code to load metadata from model dir and - # transform integer to label. Depending on whether online prediction returns label or not. - def __init__(self, model_name, version_name, label_output=None, - project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudPredictor. - - Args: - model_name: the name of the model used for prediction. - version_name: the name of the version used for prediction. - label_output: the name of the output column where all values should be converted from - index to labels. Only useful in classification. If specified, metadata_path is required. - project_id: project_id of the model. If not provided, default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default - credentials will be used. - api: an optional CloudML API client. 
- """ - self._model_name = model_name - self._version_name = version_name - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - self._full_version_name = ('projects/%s/models/%s/versions/%s' % - (self._project_id, self._model_name, self._version_name)) - - def predict(self, data): - """Make predictions on given data. - - Args: - data: a list of feature data or a pandas DataFrame. Each element in the list is an instance - which is a dictionary of feature data. - An example: - [{"sepal_length": 4.9, "sepal_width": 2.5, "petal_length": 4.5, "petal_width": 1.7}, - {"sepal_length": 5.7, "sepal_width": 2.8, "petal_length": 4.1, "petal_width": 1.3}] - Returns: - A list of prediction results for given instances. Each element is a dictionary representing - output mapping from the graph. - An example: - [{"predictions": 1, "score": [0.00078, 0.71406, 0.28515]}, - {"predictions": 1, "score": [0.00244, 0.99634, 0.00121]}] - - Raises: Exception if bad response is received from the service - Exception if the prediction result has incorrect label types - """ - if isinstance(data, pd.DataFrame): - data = data.T.to_dict().values() - - request = self._api.projects().predict(body={'instances': data}, - name=self._full_version_name) - request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' - result = request.execute() - if 'predictions' not in result: - raise Exception('Invalid response from service. Cannot find "predictions" in response.') - - return result['predictions'] diff --git a/datalab/mlalpha/_cloud_runner.py b/datalab/mlalpha/_cloud_runner.py deleted file mode 100644 index 5da4958d8..000000000 --- a/datalab/mlalpha/_cloud_runner.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -import datetime -from googleapiclient import discovery - -import datalab.context - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudRunner(object): - """CloudML Trainer API Wrapper that takes a job_request, add authentication information, - submit it to cloud, and get job response. - """ - - def __init__(self, job_request): - """Initializes an instance of a LocalRunner - - Args: - job_request: the arguments of the training job in a dict. 
For example, - { - 'package_uris': 'gs://my-bucket/iris/trainer-0.1.tar.gz', - 'python_module': 'trainer.task', - 'scale_tier': 'BASIC', - 'region': 'us-central1', - 'args': { - 'train_data_paths': ['gs://mubucket/data/features_train'], - 'eval_data_paths': ['gs://mubucket/data/features_eval'], - 'metadata_path': 'gs://mubucket/data/metadata.yaml', - 'output_path': 'gs://mubucket/data/mymodel/', - } - } - """ - - self._job_request = dict(job_request) - # convert job_args from dict to list as service required. - if 'args' in job_request and isinstance(job_request['args'], dict): - job_args = job_request['args'] - args = [] - for k,v in job_args.iteritems(): - if isinstance(v, list): - for item in v: - args.append('--' + k) - args.append(str(item)) - else: - args.append('--' + k) - args.append(str(v)) - self._job_request['args'] = args - - def _create_default_job_name(self): - job_name = datetime.datetime.now().strftime('%y%m%d_%H%M%S') - if 'python_module' in self._job_request: - job_name = self._job_request['python_module'].replace('.', '_') + \ - '_' + job_name - return job_name - - def run(self, job_id=None): - """Submit a training job to the CloudML service. - - Args: - job_id: id for the training job. If None, a UUID will be generated. - - Returns: job info returned from service. - """ - if job_id is None: - job_id = self._create_default_job_name() - job = { - 'job_id': job_id, - 'training_input': self._job_request, - } - context = datalab.context.Context.default() - cloudml = discovery.build('ml', 'v1beta1', credentials=context.credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - request = cloudml.projects().jobs().create(body=job, - parent='projects/' + context.project_id) - request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' - return request.execute() diff --git a/datalab/mlalpha/_confusion_matrix.py b/datalab/mlalpha/_confusion_matrix.py deleted file mode 100644 index 0519a3f05..000000000 --- a/datalab/mlalpha/_confusion_matrix.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -from plotly.offline import iplot - - -class ConfusionMatrix(object): - """Represents a confusion matrix.""" - - def __init__(self, predicted_labels, true_labels, counts): - """Initializes an instance of a ComfusionMatrix. the length of predicted_values, - true_values, count must be the same. - - Args: - predicted_labels: a list of predicted labels. - true_labels: a list of true labels. - counts: a list of count for each (predicted, true) combination. 
- - Raises: Exception if predicted_labels, true_labels, and counts are not of the same size - """ - if len(predicted_labels) != len(true_labels) or len(true_labels) != len(counts): - raise Exception('The input predicted_labels, true_labels, counts need to be same size.') - self._all_labels = list(set(predicted_labels) | set(true_labels)) - data = [] - for value in self._all_labels: - predicts_for_current_true_label = \ - {p: c for p, t, c in zip(predicted_labels, true_labels, counts) if t == value} - # sort by all_values and fill in zeros if needed - predicts_for_current_true_label = [predicts_for_current_true_label.get(v, 0) - for v in self._all_labels] - data.append(predicts_for_current_true_label) - self._data = data - - def plot(self): - """Plot the confusion matrix.""" - figure_data = \ - { - "data": [ - { - "x": self._all_labels, - "y": self._all_labels, - "z": self._data, - "colorscale": "YlGnBu", - "type": "heatmap" - } - ], - "layout": { - "title": "Confusion Matrix", - "xaxis": { - "title": "Predicted value", - }, - "yaxis": { - "title": "True Value", - } - } - } - iplot(figure_data) diff --git a/datalab/mlalpha/_dataset.py b/datalab/mlalpha/_dataset.py deleted file mode 100644 index e9b5b01ef..000000000 --- a/datalab/mlalpha/_dataset.py +++ /dev/null @@ -1,417 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -import google.cloud.ml.features as features -import matplotlib.pyplot as plt -import numpy as np -import os -import pandas as pd -import pandas_profiling -from plotly.graph_objs import Histogram, Scatter, Scatter3d -from plotly.offline import iplot -from plotly import tools -import seaborn as sns -import tempfile - -import datalab.utils - -try: - import IPython.core.display -except ImportError: - raise Exception('This module can only be loaded in ipython.') - - -class DataSet(object): - """Represents a dataset that can be explored through its 'analyze()' function. - The data need to be able to fit in memory. - """ - - def __init__(self, feature_set, data_paths, format='csv'): - """Initializes an instance of DataSet. - - Args: - feature_set: A feature_set describing the data. The feature_set provides data types - (for example, csv), column names, schema, data transformers, etc. - This is the same class used in CloudML preprocessing. - data_paths: A dictionary with {name: path} pair. All data need to have the same schema. - format: the format of the data, currently only 'csv' or 'tsv'. 
- - Raises: Exception if data_paths is not a dictionary - Exception if the format is not csv or tsv - """ - self._feature_set = feature_set - if not isinstance(data_paths, dict): - raise Exception('Expect "data_paths" to be a dictionary.') - self._data_paths = data_paths - if format == 'csv': - self._delimiter = ',' - elif format=='tsv': - self._delimiter = '\t' - else: - raise Exception('Unsupported format "%s"' % format) - self._dataframes = {} - self._raw_dataframes = {} - self._concatenated_data_frame = None - self._concatenated_raw_data_frame = None - self._target_name = None - self._key_name = None - - def _get_dataframe_type(self, column): - if isinstance(column, features.NumericFeatureColumn): - return np.float64 - if isinstance(column, features.TargetFeatureColumn) and column.is_numeric: - return np.float64 - return str - - def _is_categorical_column(self, column): - if isinstance(column, features.CategoricalFeatureColumn): - return True - if isinstance(column, features.TargetFeatureColumn) and not column.is_numeric: - return True - return False - - def _transform_data(self, df): - df = df.copy(deep=True) - for name, value in type(self._feature_set).__dict__.iteritems(): - for column in (value if (type(value) == list or type(value) == tuple) else [value]): - if self._is_categorical_column(column): - concatenated_column = self._concatenated_raw_data_frame[column.name] - all_categories = concatenated_column.astype('category').cat.categories - df[column.name] = pd.Categorical(df[column.name], categories=all_categories) - if isinstance(column, features.NumericFeatureColumn): - concatenated_column = self._concatenated_raw_data_frame[column.name] - # Simulate metadata so we can create a transformer from CloudML features registry. - transform_info = { - 'type': 'numeric', - 'transform': column.transform, - } - transform_info[column.transform] = column.transform_args - transform_info['max'] = max(concatenated_column) - transform_info['min'] = min(concatenated_column) - transformer = features._registries.transformation_registry \ - .get_transformer(transform_info) - transformed = [transformer.transform(x)[0] for x in df[column.name]] - if column.transform == 'discretize': - # Transformed data contains a one_of_k list so need to convert it back to index. - # Categories needs to be num_of_buckets+2 to match the transformer behavior, - # where it creates a smaller-than-min and a greater-than-max buckets. - df[column.name] = pd.Series(pd.Categorical(transformed, - categories=range(transformer._buckets+2))) - else: - # TODO(qimingj): It is supposed to work with most transformers but still need to - # test them if new transformers become available. - df[column.name] = transformed - return df - - def _load_to_dataframes(self): - if self._concatenated_raw_data_frame is not None: - return # Already loaded. - - # Step 1: Get schema from feature_set class. - schema = {} - for name, value in type(self._feature_set).__dict__.iteritems(): - for column in (value if (type(value) == list or type(value) == tuple) else [value]): - if issubclass(type(column), features.FeatureColumn): - if isinstance(column, features.TargetFeatureColumn): - self._target_name = column.name - if isinstance(column, features.KeyFeatureColumn): - self._key_name = column.name - data_type = self._get_dataframe_type(column) - schema[column.name] = data_type - if self._target_name is None: - raise Exception('No target column found from feature_set') - - # Step 2: Load all non-text data into raw dataframes. 
- for name, data_path in self._data_paths.iteritems():
- local_file = data_path
- if data_path.startswith('gs://'):
- local_file = tempfile.mktemp()
- datalab.utils.gcs_copy_file(data_path, local_file)
- self._raw_dataframes[name] = pd.read_csv(local_file,
- names=type(self._feature_set).csv_columns,
- dtype=schema,
- delimiter=self._delimiter,
- skipinitialspace=True)
- if data_path.startswith('gs://'):
- os.remove(local_file)
- self._concatenated_raw_data_frame = pd.concat(self._raw_dataframes.values())
-
- # Step 3: Transform the data.
- for name, raw_df in self._raw_dataframes.iteritems():
- self._dataframes[name] = self._transform_data(raw_df)
- self._concatenated_data_frame = pd.concat(self._dataframes.values())
-
- def _get_numeric_values(self, df, column_name):
- if str(df[column_name].dtype) == 'category':
- return df[column_name].cat.codes
- else:
- return df[column_name].values
-
- def _create_dummy_trace(self, x, y):
- # Dummy trace is needed for scatter plot to a) set the same x and y ranges across multiple
- # subplots, b) the categorical labels are sorted in the same way across multiple subplots
- # (the order of the categories depend on the order they appear in the data).
- # For a given axis, if it is categorical data, we draw one point for each category.
- # If it is numeric data, we draw min and max. Usually on x and y axises we don't have same
- # number of points, so we will pad one axis data.
- # Note: This needs to go away if plotly python supports setting ranges and specifying
- # category order across subplots.
- if str(self._concatenated_data_frame[x].dtype) == 'category':
- categories = self._concatenated_data_frame[x].cat.categories
- x_dummy = list(categories)
- else:
- x_dummy = [min(self._concatenated_data_frame[x]), max(self._concatenated_data_frame[x])]
- if str(self._concatenated_data_frame[y].dtype) == 'category':
- categories = self._concatenated_data_frame[y].cat.categories
- y_dummy = list(categories)
- else:
- y_dummy = [min(self._concatenated_data_frame[y]), max(self._concatenated_data_frame[y])]
- if len(x_dummy) > len(y_dummy):
- y_dummy = y_dummy + [y_dummy[-1]]*(len(x_dummy)-len(y_dummy))
- if len(x_dummy) < len(y_dummy):
- x_dummy = x_dummy + [x_dummy[-1]]*(len(y_dummy)-len(x_dummy))
-
- scatter_dummy = Scatter(
- x=x_dummy,
- y=y_dummy,
- showlegend=False,
- opacity=0, # Make it invisible.
- hoverinfo='none',
- )
- return scatter_dummy
-
- def _histogram(self, names, x):
- concatenated_numeric_values = self._get_numeric_values(self._concatenated_data_frame, x)
- start = min(concatenated_numeric_values)
- end = max(concatenated_numeric_values)
- size = 1 if str(self._concatenated_data_frame[x].dtype) == 'category' \
- else (max(concatenated_numeric_values) - min(concatenated_numeric_values)) / 10.0
- fig = tools.make_subplots(rows=1, cols=len(names), print_grid=False)
- histogram_index = 1
- for name in names:
- df = self._dataframes[name]
- numeric_values = self._get_numeric_values(df, x)
- text = df[x].cat.categories if str(df[x].dtype) == 'category' else None
- histogram = Histogram(
- name=name,
- x=numeric_values,
- xbins=dict(
- start=start,
- end=end,
- size=size,
- ),
- text=text,
- )
- fig.append_trace(histogram, 1, histogram_index)
- fig.layout['xaxis' + str(histogram_index)].title = x
- fig.layout['xaxis' + str(histogram_index)].range = [start, end]
- fig.layout['yaxis' + str(histogram_index)].title = 'count'
- histogram_index += 1
- fig.layout.width = min(500 * len(names), 1200)
- fig.layout.height = 500
- iplot(fig)
-
- def _scatter_plot(self, names, x, y, color):
- showscale = True if str(self._concatenated_data_frame[color].dtype) != 'category' else False
- cmin = min(self._get_numeric_values(self._concatenated_data_frame, color))
- cmax = max(self._get_numeric_values(self._concatenated_data_frame, color))
- fig = tools.make_subplots(rows=1, cols=len(names), print_grid=False)
- scatter_index = 1
- scatter_dummy = self._create_dummy_trace(x, y)
- for name in names:
- df = self._dataframes[name]
- text = ["x=%s y=%s target=%s" % (str(a),str(b),str(t)) for a,b,t
- in zip(df[x], df[y], df[color])]
- scatter = Scatter(
- name=name,
- x=df[x],
- y=df[y],
- mode='markers',
- text=text,
- hoverinfo='text',
- marker=dict(
- color=self._get_numeric_values(df, color),
- colorscale='Viridis',
- showscale=showscale,
- cmin=cmin,
- cmax=cmax,
- )
- )
- # Add dummy trace to set same ranges and categorical orders on subplots.
- fig.append_trace(scatter_dummy, 1, scatter_index)
- fig.append_trace(scatter, 1, scatter_index)
- fig.layout['xaxis' + str(scatter_index)].title = x
- fig.layout['yaxis' + str(scatter_index)].title = y
- scatter_index += 1
- fig.layout.width = min(500 * len(names), 1200)
- fig.layout.height = 500
- iplot(fig)
-
- def _scatter3d_plot(self, names, x, y, z, color):
- showscale = True if str(self._concatenated_data_frame[color].dtype) != 'category' else False
- cmin = min(self._get_numeric_values(self._concatenated_data_frame, color))
- cmax = max(self._get_numeric_values(self._concatenated_data_frame, color))
- specs = [[{'is_3d':True}]*len(self._dataframes)]
- fig = tools.make_subplots(rows=1, cols=len(names), specs=specs, print_grid=False)
- scatter3d_index = 1
- for name in names:
- df = self._dataframes[name]
- text = ["x=%s y=%s z=%s, target=%s" % (str(a),str(b),str(c),str(t)) for a,b,c,t
- in zip(df[x], df[y], df[z], df[color])]
- scatter3d = Scatter3d(
- name=name,
- x=df[x],
- y=df[y],
- z=df[z],
- mode='markers',
- text=text,
- hoverinfo='text',
- marker=dict(
- color=self._get_numeric_values(df, color),
- colorscale='Viridis',
- showscale=showscale,
- cmin=cmin,
- cmax=cmax,
- )
- )
- fig.append_trace(scatter3d, 1, scatter3d_index)
- fig.layout['scene' + str(scatter3d_index)].xaxis.title = x
- fig.layout['scene' + str(scatter3d_index)].yaxis.title = y
- fig.layout['scene' + str(scatter3d_index)].zaxis.title = z
- scatter3d_index += 1
- fig.layout.width = min(500 * len(names), 1200)
- fig.layout.height = 500
- iplot(fig)
-
- def _plot_x(self, names, x):
- self._histogram(names, x)
- if x != self._target_name:
- self._scatter_plot(names, x, self._target_name, self._target_name)
-
- def _plot_xy(self, names, x, y):
- self._scatter_plot(names, x, y, self._target_name)
-
- def _plot_xyz(self, names, x, y, z):
- self._scatter3d_plot(names, x, y, z, self._target_name)
-
- def profile(self, names=None, columns=None):
- """Print profiles of the dataset.
-
- Args:
- names: the names of the data to plot. Such as ['train']. If None, all data in the datasets
- will be used.
- columns: The list of column names to plot correlations. If None, all numeric columns
- will be used.
- """
- self._load_to_dataframes()
- if names is None:
- names = self._raw_dataframes.keys()
- html = ''
- for name in names:
- df = self._raw_dataframes[name]
- html += '' + \
- '%s
- IPython.display.display_html('Job Running...', raw=True)
- log_file_html = ''
- log_url_prefix = ''
- if datalab.context._utils._in_datalab_docker():
- log_url_prefix = '/_nocachecontent/'
- for job_type, replicas in replica_spec.iteritems():
- if replicas > 0:
- log_file_html += ('<a href="%s">%s log</a> '
- % (log_url_prefix + job_type, job_type))
- IPython.display.display_html(log_file_html, raw=True)
- IPython.display.display_html('Job Finished.', raw=True)
-
-
-def _output_train_template():
- content = """%%mlalpha train [--cloud]
-package_uris: gs://your-bucket/my-training-package.tar.gz
-python_module: your_program.your_module
-scale_tier: BASIC
-region: us-central1
-args:
- string_arg: value
- int_arg: value
- appendable_arg:
- - value1
- - value2
-"""
- IPython.get_ipython().set_next_input(content)
- parameters = ['package_uris', 'python_module', 'scale_tier', 'region', 'args']
- required_local = [False, False, False, False, False]
- required_cloud = [True, True, True, True, False]
- description = [
- 'A GCS or local (for local run only) path to your python training program package.',
- 'The module to run.',
- 'Type of resources requested for the job. On local run, BASIC means 1 master process only, ' +
- 'and any other values mean 1 master 1 worker and 1 ps processes. But you can also ' +
- 'override the values by setting worker_count and parameter_server_count. ' +
- 'On cloud, see service definition for possible values.',
- 'Where the training job runs. For cloud run only.',
- 'Args that will be passed to your training program.'
- ]
- data = [{'Parameters': x[0], 'Local Run Required': str(x[1]),
- 'Cloud Run Required': str(x[2]), 'Description': x[3]}
- for x in zip(parameters, required_local, required_cloud, description)]
- html = ('A training input template is created in next cell for you. See cell input ' +
- 'instructions below.')
- html += datalab.utils.commands.HtmlBuilder.render_table(data,
- ['Parameters', 'Local Run Required', 'Cloud Run Required', 'Description'])
-
- return IPython.core.display.HTML(html)
-
-
-def _train(args, cell):
- """ Train a model. """
- if not cell:
- return _output_train_template()
-
- env = datalab.utils.commands.notebook_environment()
- config = datalab.utils.commands.parse_config(cell, env)
- if args['cloud']:
- datalab.utils.commands.validate_config_must_have(config,
- ['package_uris', 'python_module', 'scale_tier', 'region'])
- runner = datalab.mlalpha.CloudRunner(config)
- job_info = runner.run()
- job_short_name = job_info['jobId']
- html = 'Job "%s" was submitted successfully.' % job_short_name
- html += 'Run "%%mlalpha jobs --name %s" to view the status of the job.' % job_short_name
- html += 'Click <a href="%s">here</a> to view cloud log.' % log_url
- html += 'Start TensorBoard by running "%tensorboard start --logdir=<YourLogDir>".'
- html = 'TensorBoard was started successfully with pid %d. ' % pid
- html += 'Click <a href="%s">here</a> to access it.' % url
- return IPython.core.display.HTML(html)
-
-
-def _stop(args, _):
- """ Stop a TensorBoard instance. """
- datalab.mlalpha.TensorBoardManager.stop(int(args['pid']))
-
diff --git a/datalab/notebook/static/extern/lantern-browser.html b/datalab/notebook/static/extern/lantern-browser.html
new file mode 100644
index 000000000..954b67f1f
--- /dev/null
+++ b/datalab/notebook/static/extern/lantern-browser.html
@@ -0,0 +1,5424 @@
+
+
+ html += 'Click <a href="%s">here</a> to track preprocessing job.' \
+ % dataflow_url
+ IPython.display.display_html(html, raw=True)
+
+
+def local_train(input_dir, batch_size, max_steps, output_dir, checkpoint=None):
+ """Train model locally. The output can be used for local prediction or for online deployment.
+ Args:
+ input_dir: A directory path containing preprocessed results. Can be local or GCS path.
+ batch_size: size of batch used for training.
+ max_steps: number of steps to train.
+ output_dir: The output directory to use. Can be local or GCS path.
+ checkpoint: the Inception checkpoint to use.
+ """
+
+ logger = logging.getLogger()
+ original_level = logger.getEffectiveLevel()
+ logger.setLevel(logging.INFO)
+ print 'Local training...'
+ try:
+ _local.Local(checkpoint).train(input_dir, batch_size, max_steps, output_dir)
+ finally:
+ logger.setLevel(original_level)
+ print 'Done'
+
+
+def cloud_train(input_dir, batch_size, max_steps, output_dir,
+ cloud_train_config, checkpoint=None):
+ """Train model in the cloud with CloudML trainer service.
+ The output can be used for local prediction or for online deployment.
+ Args:
+ input_dir: A directory path containing preprocessed results. GCS path only.
+ batch_size: size of batch used for training.
+ max_steps: number of steps to train.
+ output_dir: The output directory to use. GCS path only.
+ cloud_train_config: a datalab.ml.CloudTrainingConfig object.
+ checkpoint: the Inception checkpoint to use.
+ """
+
+ job = _cloud.Cloud(checkpoint=checkpoint).train(input_dir, batch_size,
+ max_steps, output_dir, cloud_train_config)
+ if (_util.is_in_IPython()):
+ import IPython
+ log_url_query_strings = {
+ 'project': _util.default_project(),
+ 'resource': 'ml.googleapis.com/job_id/' + job.info['jobId']
+ }
+ log_url = 'https://console.developers.google.com/logs/viewer?' + \
+ urllib.urlencode(log_url_query_strings)
+ html = 'Job "%s" submitted.' % job.info['jobId']
+ html += 'Click <a href="%s">here</a> to view cloud log.' % log_url
+ IPython.display.display_html(html, raw=True)
+
+
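For orientation, a usage sketch of the two training entry points above; the bucket paths, step counts, and the CloudTrainingConfig fields shown are illustrative assumptions rather than values taken from this patch:

    import datalab.ml as ml

    # Assumed CloudTrainingConfig fields; the exact field set lives in datalab.ml.
    config = ml.CloudTrainingConfig(region='us-central1', scale_tier='BASIC')

    # Quick local smoke test with a small number of steps.
    local_train('gs://my-bucket/preprocessed', batch_size=32, max_steps=100,
                output_dir='/tmp/inception_trained')

    # Full training run on the CloudML service.
    cloud_train('gs://my-bucket/preprocessed', batch_size=100, max_steps=5000,
                output_dir='gs://my-bucket/trained', cloud_train_config=config)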
+def _display_predict_results(results, show_image):
+ if (_util.is_in_IPython()):
+ import IPython
+ for image_url, image, label_and_score in results:
+ if show_image is True:
+ IPython.display.display_html('%s(%.5f)' % label_and_score,
+ raw=True)
+ IPython.display.display(IPython.display.Image(data=image))
+ else:
+ IPython.display.display_html(
+ '%s    %s(%.5f)' % ((image_url,) + label_and_score), raw=True)
+ else:
+ print results
+
+
+def local_predict(model_dir, image_files, resize=False, show_image=True):
+ """Predict using an offline model.
+ Args:
+ model_dir: The directory of a trained inception model. Can be local or GCS paths.
+ image_files: The paths to the image files to predict labels. Can be local or GCS paths.
+ show_image: Whether to show images in the results.
+ resize: Whether to resize the image to a reasonable size (300x300) before prediction.
+ """
+ print('Predicting...')
+ images = _util.load_images(image_files, resize=resize)
+ labels_and_scores = _local.Local().predict(model_dir, images)
+ results = zip(image_files, images, labels_and_scores)
+ _display_predict_results(results, show_image)
+ print('Done')
+
+
+def cloud_predict(model_id, image_files, resize=False, show_image=True):
+ """Predict using a deployed (online) model.
+ Args:
+ model_id: The deployed model id in the form of "model.version".
+ image_files: The paths to the image files to predict labels. GCS paths only.
+ show_image: Whether to show images in the results.
+ resize: Whether to resize the image to a reasonable size (300x300) before prediction.
+ Set it to True if your images are too large to send over network.
+ """
+ print('Predicting...')
+ images = _util.load_images(image_files, resize=resize)
+ labels_and_scores = _cloud.Cloud().predict(model_id, images)
+ results = zip(image_files, images, labels_and_scores)
+ _display_predict_results(results, show_image)
+ print('Done')
+
+
+def local_batch_predict(dataset, model_dir, output_csv=None, output_bq_table=None):
+ """Batch predict running locally.
+ Args:
+ dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+ one column 'image_url', or two columns with another being 'label'.
+ model_dir: The directory of a trained inception model. Can be local or GCS paths.
+ output_csv: The output csv file for prediction results. If specified,
+ it will also output a csv schema file with the name output_csv + '.schema.json'.
+ output_bq_table: if specified, the output BigQuery table for prediction results.
+ output_csv and output_bq_table can both be set.
+ Raises:
+ ValueError if both output_csv and output_bq_table are None.
+ """
+
+ if output_csv is None and output_bq_table is None:
+ raise ValueError('output_csv and output_bq_table cannot both be None.')
+
+ print('Predicting...')
+ _local.Local().batch_predict(dataset, model_dir, output_csv, output_bq_table)
+ print('Done')
+
+
+def cloud_batch_predict(dataset, model_dir, gcs_staging_location,
+ output_csv=None, output_bq_table=None, pipeline_option=None):
+ """Batch predict running in cloud.
+
+ Args:
+ dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+ one column 'image_url', or two columns with another being 'label'.
+ model_dir: A GCS path to a trained inception model directory.
+ gcs_staging_location: A temporary location for DataFlow staging.
+ output_csv: If specified, prediction results will be saved to the specified Csv file.
+ It will also output a csv schema file with the name output_csv + '.schema.json'.
+ GCS file path only.
+ output_bq_table: If specified, prediction results will be saved to the specified BigQuery
+ table. output_csv and output_bq_table can both be set, but cannot be both None.
+ pipeline_option: DataFlow pipeline options in a dictionary.
+ Raises:
+ ValueError if both output_csv and output_bq_table are None.
+ """
+
+ if output_csv is None and output_bq_table is None:
+ raise ValueError('output_csv and output_bq_table cannot both be None.')
+
+ job_name = _cloud.Cloud().batch_predict(dataset, model_dir,
+ gcs_staging_location, output_csv, output_bq_table, pipeline_option)
+ if (_util.is_in_IPython()):
+ import IPython
+
+ dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
+ _util.default_project())
+ html = 'Job "%s" submitted.' % job_name
+ html += ('Click <a href="%s">here</a> to track batch prediction job.'
+ % dataflow_url)
+ IPython.display.display_html(html, raw=True)
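A hedged usage sketch for the batch-prediction entry points above; the dataset paths and the CsvDataSet constructor arguments are assumptions for illustration only:

    import datalab.ml as ml

    # Two STRING columns, image_url and label, as required by check_dataset() in _util.py.
    dataset = ml.CsvDataSet(file_pattern='gs://my-bucket/eval.csv',
                            schema='image_url:STRING,label:STRING')

    # Writes a CSV (plus a .schema.json file) and a BigQuery table with the predictions.
    cloud_batch_predict(dataset,
                        model_dir='gs://my-bucket/trained/model',
                        gcs_staging_location='gs://my-bucket/staging',
                        output_csv='gs://my-bucket/predict_results.csv',
                        output_bq_table='mydataset.predict_results')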
diff --git a/solutionbox/inception/datalab_solutions/inception/_predictor.py b/solutionbox/inception/datalab_solutions/inception/_predictor.py
new file mode 100644
index 000000000..03f3974f6
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/_predictor.py
@@ -0,0 +1,226 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Local implementation for preprocessing, training and prediction for inception model.
+"""
+
+import apache_beam as beam
+import collections
+import json
+import os
+import tensorflow as tf
+
+from . import _model
+from . import _util
+
+
+def _tf_predict(model_dir, images):
+ model_dir = os.path.join(model_dir, 'model')
+ with tf.Session() as sess:
+ new_saver = tf.train.import_meta_graph(os.path.join(model_dir, 'export.meta'))
+ new_saver.restore(sess, os.path.join(model_dir, 'export'))
+ init_op = tf.get_collection(tf.contrib.session_bundle.constants.INIT_OP_KEY)[0]
+ sess.run(init_op)
+ inputs = json.loads(tf.get_collection('inputs')[0])
+ outputs = json.loads(tf.get_collection('outputs')[0])
+ feed_dict = collections.defaultdict(list)
+ for ii, image in enumerate(images):
+ feed_dict[inputs['image_bytes']].append(image)
+ feed_dict[inputs['key']].append(str(ii))
+ predictions, labels, scores = sess.run(
+ [outputs['prediction'], outputs['labels'], outputs['scores']], feed_dict=feed_dict)
+ return zip(predictions, labels, scores)
+
+
+def predict(model_dir, images):
+ """Local instant prediction."""
+
+ results = _tf_predict(model_dir, images)
+ predicted_and_scores = [(predicted, label_scores[list(labels).index(predicted)])
+ for predicted, labels, label_scores in results]
+ return predicted_and_scores
+
+
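The index lookup in predict() pairs each predicted label with its own score; a standalone illustration with made-up values:

    # One (prediction, labels, scores) triple as returned per image by _tf_predict.
    predicted = 'daisy'
    labels = ['daisy', 'rose', 'tulip']
    scores = [0.91, 0.06, 0.03]

    # Same lookup as in predict(): take the score at the predicted label's position.
    predicted_and_score = (predicted, scores[labels.index(predicted)])
    print(predicted_and_score)  # ('daisy', 0.91)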
+# Helpers for batch prediction dataflow pipeline
+
+class EmitAsBatchDoFn(beam.DoFn):
+ """A DoFn that buffers the records and emits them batch by batch."""
+
+ def __init__(self, batch_size):
+ self._batch_size = batch_size
+ self._cached = []
+
+ def process(self, element):
+ self._cached.append(element)
+ if len(self._cached) >= self._batch_size:
+ emit = self._cached
+ self._cached = []
+ yield emit
+
+ def finish_bundle(self, context=None):
+ if len(self._cached) > 0: # pylint: disable=g-explicit-length-test
+ yield self._cached
+
+
+class UnbatchDoFn(beam.DoFn):
+ """A DoFn expand batch into elements."""
+
+ def process(self, element):
+ for item in element:
+ yield item
+
+
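EmitAsBatchDoFn and UnbatchDoFn are inverses around the batch-predict step; the buffering they implement can be illustrated without Beam as a plain-Python sketch:

    def emit_as_batches(elements, batch_size):
        # Mirrors EmitAsBatchDoFn: buffer elements, emit full batches, then the remainder.
        cached = []
        for element in elements:
            cached.append(element)
            if len(cached) >= batch_size:
                yield cached
                cached = []
        if cached:  # finish_bundle: flush whatever is left over.
            yield cached

    def unbatch(batches):
        # Mirrors UnbatchDoFn: flatten batches back into single elements.
        for batch in batches:
            for item in batch:
                yield item

    items = list(range(7))
    batches = list(emit_as_batches(items, 3))   # [[0, 1, 2], [3, 4, 5], [6]]
    print(batches)
    print(list(unbatch(batches)) == items)      # True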
+class LoadImagesDoFn(beam.DoFn):
+ """A DoFn that reads image from url."""
+
+ def process(self, element):
+ with _util.open_local_or_gcs(element['image_url'], 'r') as ff:
+ image_bytes = ff.read()
+ out_element = {'image_bytes': image_bytes}
+ out_element.update(element)
+ yield out_element
+
+
+class PredictBatchDoFn(beam.DoFn):
+ """A DoFn that does batch prediction."""
+
+ def __init__(self, model_dir):
+ import os
+
+ self._model_dir = os.path.join(model_dir, 'model')
+ self._session = None
+ self._tf_inputs = None
+ self._tf_outputs = None
+
+ def start_bundle(self, context=None):
+ import json
+ import os
+ import tensorflow as tf
+
+ self._session = tf.Session()
+ new_saver = tf.train.import_meta_graph(os.path.join(self._model_dir, 'export.meta'))
+ new_saver.restore(self._session, os.path.join(self._model_dir, 'export'))
+ init_op = tf.get_collection(tf.contrib.session_bundle.constants.INIT_OP_KEY)[0]
+ self._session.run(init_op)
+ self._tf_inputs = json.loads(tf.get_collection('inputs')[0])
+ self._tf_outputs = json.loads(tf.get_collection('outputs')[0])
+
+ def finish_bundle(self, context=None):
+ if self._session is not None:
+ self._session.close()
+
+ def process(self, element):
+ import collections
+
+ image_urls = [x['image_url'] for x in element]
+ targets = None
+ if 'label' in element[0] and element[0]['label'] is not None:
+ targets = [x['label'] for x in element]
+
+ feed_dict = collections.defaultdict(list)
+ feed_dict[self._tf_inputs['image_bytes']] = [x['image_bytes'] for x in element]
+ feed_dict[self._tf_inputs['key']] = image_urls
+ predictions, labels, scores = self._session.run(
+ [self._tf_outputs['prediction'], self._tf_outputs['labels'], self._tf_outputs['scores']],
+ feed_dict=feed_dict)
+ if targets is not None:
+ yield zip(image_urls, targets, predictions, labels, scores)
+ else:
+ yield zip(image_urls, predictions, labels, scores)
+
+
+class ProcessResultsDoFn(beam.DoFn):
+ """A DoFn that process prediction results by casting values and calculating
+ target_prob.
+ """
+
+ def process(self, element):
+ target = None
+ if len(element) == 5:
+ image_url, target, prediction, labels, scores = element
+ else:
+ image_url, prediction, labels, scores = element
+ labels = list(labels)
+ predicted_prob = scores[labels.index(prediction)]
+ out_element = {
+ 'image_url': image_url,
+ 'predicted': prediction,
+ # Convert to float from np.float32 because BigQuery Sink can only handle intrinsic types.
+ 'predicted_prob': float(predicted_prob)
+ }
+ if target is not None:
+ target_prob = scores[labels.index(target)] if target in labels else 0.0
+ out_element['target_prob'] = float(target_prob)
+ out_element['target'] = target
+ yield out_element
+
+
+class MakeCsvLineDoFn(beam.DoFn):
+ """A DoFn that makes CSV lines out of prediction results."""
+
+ def process(self, element):
+ import csv
+ import StringIO
+
+ line = StringIO.StringIO()
+ if len(element) == 5:
+ csv.DictWriter(line,
+ ['image_url', 'target', 'predicted', 'target_prob', 'predicted_prob']).writerow(element)
+ else:
+ csv.DictWriter(line, ['image_url', 'predicted', 'predicted_prob']).writerow(element)
+ yield line.getvalue()
+
+
+def configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table):
+ """Configures a dataflow pipeline for batch prediction."""
+
+ data = _util.get_sources_from_dataset(p, dataset, 'predict')
+ if len(dataset.schema) == 2:
+ output_schema = [
+ {'name': 'image_url', 'type': 'STRING'},
+ {'name': 'target', 'type': 'STRING'},
+ {'name': 'predicted', 'type': 'STRING'},
+ {'name': 'target_prob', 'type': 'FLOAT'},
+ {'name': 'predicted_prob', 'type': 'FLOAT'},
+ ]
+ else:
+ output_schema = [
+ {'name': 'image_url', 'type': 'STRING'},
+ {'name': 'predicted', 'type': 'STRING'},
+ {'name': 'predicted_prob', 'type': 'FLOAT'},
+ ]
+ results = (data
+ | 'Load Images' >> beam.ParDo(LoadImagesDoFn())
+ | 'Batch Inputs' >> beam.ParDo(EmitAsBatchDoFn(20))
+ | 'Batch Predict' >> beam.ParDo(PredictBatchDoFn(model_dir))
+ | 'Unbatch' >> beam.ParDo(UnbatchDoFn())
+ | 'Process Results' >> beam.ParDo(ProcessResultsDoFn()))
+
+ if output_csv is not None:
+ schema_file = output_csv + '.schema.json'
+ results_save = (results
+ | 'Prepare For Output' >> beam.ParDo(MakeCsvLineDoFn())
+ | 'Write Csv Results' >> beam.io.textio.WriteToText(output_csv, shard_name_template=''))
+ (results_save
+ | beam.transforms.combiners.Sample.FixedSizeGlobally('Sample One', 1)
+ | 'Serialize Schema' >> beam.Map(lambda path: json.dumps(output_schema))
+ | 'Write Schema' >> beam.io.textio.WriteToText(schema_file, shard_name_template=''))
+ if output_bq_table is not None:
+ # BigQuery sink takes schema in the form of 'field1:type1,field2:type2...'
+ bq_schema_string = ','.join(x['name'] + ':' + x['type'] for x in output_schema)
+ sink = beam.io.BigQuerySink(output_bq_table, schema=bq_schema_string,
+ write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
+ results | 'Write BQ Results' >> beam.io.Write(sink)
+
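The comment above refers to the 'field:type' schema string expected by the BigQuery sink; for the two-column input case it is assembled exactly like this:

    output_schema = [
        {'name': 'image_url', 'type': 'STRING'},
        {'name': 'target', 'type': 'STRING'},
        {'name': 'predicted', 'type': 'STRING'},
        {'name': 'target_prob', 'type': 'FLOAT'},
        {'name': 'predicted_prob', 'type': 'FLOAT'},
    ]
    bq_schema_string = ','.join(x['name'] + ':' + x['type'] for x in output_schema)
    print(bq_schema_string)
    # image_url:STRING,target:STRING,predicted:STRING,target_prob:FLOAT,predicted_prob:FLOAT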
diff --git a/solutionbox/inception/datalab_solutions/inception/_preprocess.py b/solutionbox/inception/datalab_solutions/inception/_preprocess.py
new file mode 100644
index 000000000..a79fed671
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/_preprocess.py
@@ -0,0 +1,363 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Preprocess pipeline implementation with Cloud DataFlow.
+"""
+
+
+import apache_beam as beam
+from apache_beam.io import fileio
+from apache_beam.io import tfrecordio
+from apache_beam.metrics import Metrics
+from apache_beam.utils.pipeline_options import PipelineOptions
+import cStringIO
+import csv
+import logging
+import os
+from PIL import Image
+import tensorflow as tf
+
+from . import _inceptionlib
+from . import _util
+
+
+slim = tf.contrib.slim
+
+error_count = Metrics.counter('main', 'errorCount')
+rows_count = Metrics.counter('main', 'rowsCount')
+skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
+embedding_good = Metrics.counter('main', 'embedding_good')
+embedding_bad = Metrics.counter('main', 'embedding_bad')
+incompatible_image = Metrics.counter('main', 'incompatible_image')
+invalid_uri = Metrics.counter('main', 'invalid_file_name')
+unlabeled_image = Metrics.counter('main', 'unlabeled_image')
+
+
+class ExtractLabelIdsDoFn(beam.DoFn):
+ """Extracts (uri, label_ids) tuples from CSV rows.
+ """
+
+ def start_bundle(self, context=None):
+ self.label_to_id_map = {}
+
+ def process(self, element, all_labels):
+ all_labels = list(all_labels)
+ # Dataflow cannot guarantee the order of the labels when materializing them.
+ # The labels materialized and consumed by training may not be in the same order
+ # as the ones used in preprocessing, so we sort them in both preprocessing
+ # and training to make the order match.
+ all_labels.sort()
+ if not self.label_to_id_map:
+ for i, label in enumerate(all_labels):
+ label = label.strip()
+ if label:
+ self.label_to_id_map[label] = i
+
+ # Row format is:
+ # image_uri,label_id
+ if not element:
+ skipped_empty_line.inc()
+ return
+
+ rows_count.inc()
+ uri = element['image_url']
+ if not uri or not uri.startswith('gs://'):
+ invalid_uri.inc()
+ return
+
+ try:
+ label_id = self.label_to_id_map[element['label'].strip()]
+ except KeyError:
+ unlabeled_image.inc()
+ yield uri, label_id
+
+
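The sort in ExtractLabelIdsDoFn.process is what keeps label ids stable between preprocessing and training runs; a small standalone illustration:

    # Labels may arrive in any order from the Beam side input.
    labels_run_1 = ['tulip', 'daisy', 'rose']
    labels_run_2 = ['rose', 'tulip', 'daisy']

    def label_ids(all_labels):
        # Same idea as in the DoFn: sort first, then enumerate.
        return {label.strip(): i for i, label in enumerate(sorted(all_labels)) if label.strip()}

    print(label_ids(labels_run_1))                             # {'daisy': 0, 'rose': 1, 'tulip': 2}
    print(label_ids(labels_run_1) == label_ids(labels_run_2))  # True: order-independent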
+class ReadImageAndConvertToJpegDoFn(beam.DoFn):
+ """Read files from GCS and convert images to JPEG format.
+
+ We do this even for JPEG images to remove variations such as different number
+ of channels.
+ """
+
+ def process(self, element):
+ uri, label_id = element
+
+ try:
+ with _util.open_local_or_gcs(uri, mode='r') as f:
+ img = Image.open(f).convert('RGB')
+ # A variety of different calling libraries throw different exceptions here.
+ # They all correspond to an unreadable file so we treat them equivalently.
+ # pylint: disable=broad-except
+ except Exception as e:
+ logging.exception('Error processing image %s: %s', uri, str(e))
+ error_count.inc()
+ return
+
+ # Convert to desired format and output.
+ output = cStringIO.StringIO()
+ img.save(output, 'jpeg')
+ image_bytes = output.getvalue()
+ yield uri, label_id, image_bytes
+
+
+class EmbeddingsGraph(object):
+ """Builds a graph and uses it to extract embeddings from images.
+ """
+
+ # These constants are set by Inception v3's expectations.
+ WIDTH = 299
+ HEIGHT = 299
+ CHANNELS = 3
+
+ def __init__(self, tf_session, checkpoint_path):
+ self.tf_session = tf_session
+ # input_jpeg is the tensor that contains raw image bytes.
+ # It is used to feed image bytes and obtain embeddings.
+ self.input_jpeg, self.embedding = self.build_graph()
+ self.tf_session.run(tf.global_variables_initializer())
+ self.restore_from_checkpoint(checkpoint_path)
+
+ def build_graph(self):
+ """Forms the core by building a wrapper around the inception graph.
+
+ Here we add the necessary input & output tensors, to decode jpegs,
+ serialize embeddings, restore from checkpoint etc.
+
+ To use other Inception models modify this file. Note that to use other
+ models beside Inception, you should make sure input_shape matches
+ their input. Resizing or other modifications may be necessary as well.
+ See tensorflow/contrib/slim/python/slim/nets/inception_v3.py for
+ details about InceptionV3.
+
+ Returns:
+ input_jpeg: A tensor containing raw image bytes as the input layer.
+ embedding: The embeddings tensor, that will be materialized later.
+ """
+
+ input_jpeg = tf.placeholder(tf.string, shape=None)
+ image = tf.image.decode_jpeg(input_jpeg, channels=self.CHANNELS)
+
+ # Note resize expects a batch_size, but we are feeding a single image.
+ # So we have to expand then squeeze. Resize returns float32 in the
+ # range [0, uint8_max]
+ image = tf.expand_dims(image, 0)
+
+ # convert_image_dtype also scales [0, uint8_max] -> [0 ,1).
+ image = tf.image.convert_image_dtype(image, dtype=tf.float32)
+ image = tf.image.resize_bilinear(
+ image, [self.HEIGHT, self.WIDTH], align_corners=False)
+
+ # Then rescale range to [-1, 1) for Inception.
+ image = tf.subtract(image, 0.5)
+ inception_input = tf.multiply(image, 2.0)
+
+ # Build Inception layers, which expect a tensor of type float from [-1, 1)
+ # and shape [batch_size, height, width, channels].
+ with slim.arg_scope(_inceptionlib.inception_v3_arg_scope()):
+ _, end_points = _inceptionlib.inception_v3(inception_input, is_training=False)
+
+ embedding = end_points['PreLogits']
+ return input_jpeg, embedding
+
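build_graph rescales decoded pixels from [0, 255] to the [-1, 1) range Inception expects; the same arithmetic applied to a single channel value:

    def rescale_for_inception(pixel_value):
        # decode_jpeg gives uint8 in [0, 255]; convert_image_dtype scales that to [0, 1].
        scaled = pixel_value / 255.0
        # Shift and stretch to [-1, 1], matching tf.subtract(..., 0.5) and tf.multiply(..., 2.0).
        return (scaled - 0.5) * 2.0

    print(rescale_for_inception(0))    # -1.0
    print(rescale_for_inception(128))  # ~0.004
    print(rescale_for_inception(255))  # 1.0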
+ def restore_from_checkpoint(self, checkpoint_path):
+ """To restore inception model variables from the checkpoint file.
+
+ Some variables might be missing in the checkpoint file, so it only
+ loads the ones that are available, assuming the rest would be
+ initialized later.
+ Args:
+ checkpoint_path: Path to the checkpoint file for the Inception graph.
+ """
+ # Get all variables to restore. Exclude Logits and AuxLogits because they
+ # depend on the input data and we do not need to initialize them from
+ # checkpoint.
+ all_vars = tf.contrib.slim.get_variables_to_restore(
+ exclude=['InceptionV3/AuxLogits', 'InceptionV3/Logits', 'global_step'])
+
+ saver = tf.train.Saver(all_vars)
+ saver.restore(self.tf_session, checkpoint_path)
+
+ def calculate_embedding(self, batch_image_bytes):
+ """Get the embeddings for a given JPEG image.
+
+ Args:
+ batch_image_bytes: As if returned from [ff.read() for ff in file_list].
+
+ Returns:
+ The Inception embeddings (bottleneck layer output)
+ """
+ return self.tf_session.run(
+ self.embedding, feed_dict={self.input_jpeg: batch_image_bytes})
+
+
+class TFExampleFromImageDoFn(beam.DoFn):
+ """Embeds image bytes and labels, stores them in tensorflow.Example.
+
+ (uri, label_ids, image_bytes) -> (tensorflow.Example).
+
+ Output proto contains 'label', 'image_uri' and 'embedding'.
+ The 'embedding' is calculated by feeding image into input layer of image
+ neural network and reading output of the bottleneck layer of the network.
+
+ Attributes:
+ image_graph_uri: a URI to the GCS bucket where the serialized image graph is
+ stored.
+ """
+
+ def __init__(self, checkpoint_path):
+ self.tf_session = None
+ self.graph = None
+ self.preprocess_graph = None
+ self._checkpoint_path = checkpoint_path
+
+ def start_bundle(self, context=None):
+ # There is one tensorflow session per instance of TFExampleFromImageDoFn.
+ # The same instance of session is re-used between bundles.
+ # Session is closed by the destructor of Session object, which is called
+ # when instance of TFExampleFromImageDoFn() is destructed.
+ if not self.graph:
+ self.graph = tf.Graph()
+ self.tf_session = tf.InteractiveSession(graph=self.graph)
+ with self.graph.as_default():
+ self.preprocess_graph = EmbeddingsGraph(self.tf_session, self._checkpoint_path)
+
+ def finish_bundle(self, context=None):
+ if self.tf_session is not None:
+ self.tf_session.close()
+
+ def process(self, element):
+
+ def _bytes_feature(value):
+ return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
+
+ def _float_feature(value):
+ return tf.train.Feature(float_list=tf.train.FloatList(value=value))
+
+ uri, label_id, image_bytes = element
+
+ try:
+ embedding = self.preprocess_graph.calculate_embedding(image_bytes)
+ except tf.errors.InvalidArgumentError as e:
+ incompatible_image.inc()
+ logging.warning('Could not encode an image from %s: %s', uri, str(e))
+ return
+
+ if embedding.any():
+ embedding_good.inc()
+ else:
+ embedding_bad.inc()
+
+ example = tf.train.Example(features=tf.train.Features(feature={
+ 'image_uri': _bytes_feature([str(uri)]),
+ 'embedding': _float_feature(embedding.ravel().tolist()),
+ }))
+
+ example.features.feature['label'].int64_list.value.append(label_id)
+
+ yield example
+
+
+class TrainEvalSplitPartitionFn(beam.PartitionFn):
+ """Split train and eval data."""
+ def partition_for(self, element, num_partitions):
+ import random
+ return 1 if random.random() > 0.7 else 0
+
+
+class ExampleProtoCoder(beam.coders.Coder):
+ """A coder to encode and decode TensorFlow Example objects."""
+
+ def __init__(self):
+ import tensorflow as tf # pylint: disable=g-import-not-at-top
+ self._tf_train = tf.train
+
+ def encode(self, example_proto):
+ return example_proto.SerializeToString()
+
+ def decode(self, serialized_str):
+ example = self._tf_train.Example()
+ example.ParseFromString(serialized_str)
+ return example
+
+
+class SaveFeatures(beam.PTransform):
+ """Save Features in a TFRecordIO format.
+ """
+
+ def __init__(self, file_path_prefix):
+ super(SaveFeatures, self).__init__('SaveFeatures')
+ self._file_path_prefix = file_path_prefix
+
+ def expand(self, features):
+ return (features
+ | 'Write to %s' % self._file_path_prefix.replace('/', '_')
+ >> tfrecordio.WriteToTFRecord(
+ file_path_prefix=self._file_path_prefix,
+ file_name_suffix='.tfrecord.gz',
+ shard_name_template=fileio.DEFAULT_SHARD_NAME_TEMPLATE,
+ coder=ExampleProtoCoder(),
+ compression_type=fileio.CompressionTypes.AUTO))
+
+
+def _labels_pipeline(sources):
+ labels = (sources
+ | 'Flatten Sources for labels' >> beam.Flatten()
+ | 'Parse input for labels' >> beam.Map(lambda x: str(x['label']))
+ | 'Combine labels' >> beam.transforms.combiners.Count.PerElement()
+ | 'Get labels' >> beam.Map(lambda label_count: label_count[0]))
+ return labels
+
+
+def _transformation_pipeline(source, checkpoint, labels, mode):
+ transformed = (source
+ | 'Extract label ids(%s)' % mode >> beam.ParDo(ExtractLabelIdsDoFn(),
+ beam.pvalue.AsIter(labels))
+ | 'Read and convert to JPEG(%s)' % mode >> beam.ParDo(ReadImageAndConvertToJpegDoFn())
+ | 'Embed and make TFExample(%s)' % mode >>
+ beam.ParDo(TFExampleFromImageDoFn(checkpoint)))
+ return transformed
+
+
+def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
+ source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
+ labels_source = [source_train]
+ if dataset_eval is not None:
+ source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
+ labels_source.append(source_eval)
+
+ labels = _labels_pipeline(labels_source)
+ train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
+ if dataset_eval is not None:
+ # explicit eval data.
+ eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
+ else:
+ # Split train/eval.
+ train_preprocessed, eval_preprocessed = (train_preprocessed |
+ 'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2))
+
+ output_train_path = os.path.join(output_dir, job_id, 'train')
+ output_eval_path = os.path.join(output_dir, job_id, 'eval')
+ labels_file = os.path.join(output_dir, job_id, 'labels')
+ labels_save = (labels
+ | 'Write labels' >> beam.io.textio.WriteToText(labels_file, shard_name_template=''))
+ train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
+ eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
+ # Make sure we write "latest" file after train and eval data are successfully written.
+ output_latest_file = os.path.join(output_dir, 'latest')
+ ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
+ beam.transforms.combiners.Sample.FixedSizeGlobally('Fixed One', 1) |
+ beam.Map(lambda path: job_id) |
+ 'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
+
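configure_pipeline writes everything under a per-job subdirectory plus a top-level 'latest' pointer; a sketch of the resulting layout using the same path arithmetic (bucket and job id are placeholders):

    import os

    output_dir = 'gs://my-bucket/preprocessed'   # placeholder bucket
    job_id = 'job_20170301_120000'               # placeholder job id

    print(os.path.join(output_dir, job_id, 'train'))   # train*.tfrecord.gz shards
    print(os.path.join(output_dir, job_id, 'eval'))    # eval*.tfrecord.gz shards
    print(os.path.join(output_dir, job_id, 'labels'))  # one label per line
    print(os.path.join(output_dir, 'latest'))          # contains the most recent job_id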
diff --git a/solutionbox/inception/datalab_solutions/inception/_trainer.py b/solutionbox/inception/datalab_solutions/inception/_trainer.py
new file mode 100644
index 000000000..8003ee0ba
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/_trainer.py
@@ -0,0 +1,274 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Training implementation for inception model.
+"""
+
+import logging
+import os
+import tensorflow as tf
+import time
+
+from . import _util
+
+
+def start_server(cluster, task):
+ if not task.type:
+ raise ValueError('--task_type must be specified.')
+ if task.index is None:
+ raise ValueError('--task_index must be specified.')
+
+ # Create and start a server.
+ return tf.train.Server(
+ tf.train.ClusterSpec(cluster),
+ protocol='grpc',
+ job_name=task.type,
+ task_index=task.index)
+
+class Evaluator(object):
+ """Loads variables from latest checkpoint and performs model evaluation."""
+
+ def __init__(self, model, data_paths, batch_size, output_path, dataset='eval'):
+ data_size = self._data_size(data_paths)
+ if data_size <= batch_size:
+ raise Exception('Data size is smaller than batch size.')
+ self.num_eval_batches = data_size // batch_size
+ self.batch_of_examples = []
+ self.checkpoint_path = os.path.join(output_path, 'train')
+ self.output_path = os.path.join(output_path, dataset)
+ self.eval_data_paths = data_paths
+ self.batch_size = batch_size
+ self.model = model
+
+
+ def _data_size(self, data_paths):
+ n = 0
+ options = tf.python_io.TFRecordOptions(
+ compression_type=tf.python_io.TFRecordCompressionType.GZIP)
+ for file in data_paths:
+ for line in tf.python_io.tf_record_iterator(file, options=options):
+ n += 1
+ return n
+
+ def evaluate(self, num_eval_batches=None):
+ """Run one round of evaluation, return loss and accuracy."""
+
+ num_eval_batches = num_eval_batches or self.num_eval_batches
+ with tf.Graph().as_default() as graph:
+ self.tensors = self.model.build_eval_graph(self.eval_data_paths,
+ self.batch_size)
+ self.summary = tf.summary.merge_all()
+ self.saver = tf.train.Saver()
+
+ self.summary_writer = tf.summary.FileWriter(self.output_path)
+ self.sv = tf.train.Supervisor(
+ graph=graph,
+ logdir=self.output_path,
+ summary_op=None,
+ global_step=None,
+ saver=self.saver)
+
+ last_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
+ with self.sv.managed_session(
+ master='', start_standard_services=False) as session:
+ self.sv.saver.restore(session, last_checkpoint)
+
+ if not self.batch_of_examples:
+ self.sv.start_queue_runners(session)
+ for i in range(num_eval_batches):
+ self.batch_of_examples.append(session.run(self.tensors.examples))
+
+ for i in range(num_eval_batches):
+ session.run(self.tensors.metric_updates,
+ {self.tensors.examples: self.batch_of_examples[i]})
+
+ metric_values = session.run(self.tensors.metric_values)
+ global_step = tf.train.global_step(session, self.tensors.global_step)
+ summary = session.run(self.summary)
+ self.summary_writer.add_summary(summary, global_step)
+ self.summary_writer.flush()
+ return metric_values
+
+
+
+class Trainer(object):
+ """Performs model training and optionally evaluation."""
+
+ def __init__(self, input_dir, batch_size, max_steps, output_path, model, cluster, task):
+ train_files, eval_files = _util.get_train_eval_files(input_dir)
+ self.train_data_paths = train_files
+ self.output_path = output_path
+ self.batch_size = batch_size
+ self.model = model
+ self.max_steps = max_steps
+ self.cluster = cluster
+ self.task = task
+ self.evaluator = Evaluator(self.model, eval_files, batch_size, output_path, 'eval_set')
+ self.train_evaluator = Evaluator(self.model, train_files, batch_size, output_path, 'train_set')
+ self.min_train_eval_rate = 8
+
+ def run_training(self):
+ """Runs a Master."""
+ self.train_path = os.path.join(self.output_path, 'train')
+ self.model_path = os.path.join(self.output_path, 'model')
+ self.is_master = self.task.type != 'worker'
+ log_interval = 15
+ self.eval_interval = 30
+ if self.is_master and self.task.index > 0:
+ raise StandardError('Only one replica of master expected')
+
+ if self.cluster:
+ logging.info('Starting %s/%d', self.task.type, self.task.index)
+ server = start_server(self.cluster, self.task)
+ target = server.target
+ device_fn = tf.train.replica_device_setter(
+ ps_device='/job:ps',
+ worker_device='/job:%s/task:%d' % (self.task.type, self.task.index),
+ cluster=self.cluster)
+ # We use a device_filter to limit the communication between this job
+ # and the parameter servers, i.e., there is no need to directly
+ # communicate with the other workers; attempting to do so can result
+ # in reliability problems.
+ device_filters = [
+ '/job:ps', '/job:%s/task:%d' % (self.task.type, self.task.index)
+ ]
+ config = tf.ConfigProto(device_filters=device_filters)
+ else:
+ target = ''
+ device_fn = ''
+ config = None
+
+ with tf.Graph().as_default() as graph:
+ with tf.device(device_fn):
+ # Build the training graph.
+ self.tensors = self.model.build_train_graph(self.train_data_paths,
+ self.batch_size)
+
+ # Add the variable initializer Op.
+ init_op = tf.global_variables_initializer()
+
+ # Create a saver for writing training checkpoints.
+ self.saver = tf.train.Saver()
+
+ # Build the summary operation based on the TF collection of Summaries.
+ self.summary_op = tf.summary.merge_all()
+
+ # Create a "supervisor", which oversees the training process.
+ self.sv = tf.train.Supervisor(
+ graph,
+ is_chief=self.is_master,
+ logdir=self.train_path,
+ init_op=init_op,
+ saver=self.saver,
+ # Write summary_ops by hand.
+ summary_op=None,
+ global_step=self.tensors.global_step,
+ # No saving; we do it manually in order to easily evaluate immediately
+ # afterwards.
+ save_model_secs=0)
+
+ should_retry = True
+ to_run = [self.tensors.global_step, self.tensors.train]
+
+ while should_retry:
+ try:
+ should_retry = False
+ with self.sv.managed_session(target, config=config) as session:
+ self.start_time = start_time = time.time()
+ self.last_save = self.last_log = 0
+ self.global_step = self.last_global_step = 0
+ self.local_step = self.last_local_step = 0
+ self.last_global_time = self.last_local_time = start_time
+
+ # Loop until the supervisor shuts down or max_steps have
+ # completed.
+ max_steps = self.max_steps
+ while not self.sv.should_stop() and self.global_step < max_steps:
+ try:
+ # Run one step of the model.
+ self.global_step = session.run(to_run)[0]
+ self.local_step += 1
+
+ self.now = time.time()
+ is_time_to_eval = (self.now - self.last_save) > self.eval_interval
+ is_time_to_log = (self.now - self.last_log) > log_interval
+ should_eval = self.is_master and is_time_to_eval
+ should_log = is_time_to_log or should_eval
+
+ if should_log:
+ self.log(session)
+
+ if should_eval:
+ self.eval(session)
+ except tf.errors.AbortedError:
+ should_retry = True
+
+ if self.is_master:
+ # Take the final checkpoint and compute the final accuracy.
+ # self.saver.save(session, self.sv.save_path, self.tensors.global_step)
+ self.eval(session)
+
+ except tf.errors.AbortedError:
+ print('Hitting an AbortedError. Trying it again.')
+ should_retry = True
+
+ # Export the model for inference.
+ if self.is_master:
+ self.model.export(tf.train.latest_checkpoint(self.train_path), self.model_path)
+
+ # Ask for all the services to stop.
+ self.sv.stop()
+
+ def log(self, session):
+ """Logs training progress."""
+ logging.info('Train [%s/%d], step %d (%.3f sec) %.1f '
+ 'global steps/s, %.1f local steps/s', self.task.type,
+ self.task.index, self.global_step,
+ (self.now - self.start_time),
+ (self.global_step - self.last_global_step) /
+ (self.now - self.last_global_time),
+ (self.local_step - self.last_local_step) /
+ (self.now - self.last_local_time))
+ self.last_log = self.now
+ self.last_global_step, self.last_global_time = self.global_step, self.now
+ self.last_local_step, self.last_local_time = self.local_step, self.now
+
+ def eval(self, session):
+ """Runs evaluation loop."""
+ eval_start = time.time()
+ self.saver.save(session, self.sv.save_path, self.tensors.global_step)
+ logging.info(
+ 'Eval, step %d:\n- on train set %s\n-- on eval set %s',
+ self.global_step,
+ self.model.format_metric_values(self.train_evaluator.evaluate()),
+ self.model.format_metric_values(self.evaluator.evaluate()))
+ now = time.time()
+
+ # Make sure eval doesn't consume too much of total time.
+ eval_time = now - eval_start
+ train_eval_rate = self.eval_interval / eval_time
+ if train_eval_rate < self.min_train_eval_rate and self.last_save > 0:
+ logging.info('Adjusting eval interval from %.2fs to %.2fs',
+ self.eval_interval, self.min_train_eval_rate * eval_time)
+ self.eval_interval = self.min_train_eval_rate * eval_time
+
+ self.last_save = now
+ self.last_log = now
+
+ def save_summaries(self, session):
+ self.sv.summary_computed(session,
+ session.run(self.summary_op), self.global_step)
+ self.sv.summary_writer.flush()
+
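The interval adjustment at the end of Trainer.eval keeps evaluation from dominating wall-clock time; the same arithmetic in isolation, with made-up numbers:

    min_train_eval_rate = 8     # train at least 8x as long as one eval pass
    eval_interval = 30.0        # seconds between evals
    eval_time = 10.0            # how long the last eval pass took

    train_eval_rate = eval_interval / eval_time
    if train_eval_rate < min_train_eval_rate:
        # Stretch the interval so eval_interval / eval_time reaches min_train_eval_rate.
        eval_interval = min_train_eval_rate * eval_time
    print(eval_interval)  # 80.0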
diff --git a/solutionbox/inception/datalab_solutions/inception/_util.py b/solutionbox/inception/datalab_solutions/inception/_util.py
new file mode 100644
index 000000000..8e2ad9fa9
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/_util.py
@@ -0,0 +1,268 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Reusable utility functions.
+"""
+
+from apache_beam.io import gcsio
+import collections
+import glob
+import multiprocessing
+import os
+
+import tensorflow as tf
+from tensorflow.python.lib.io import file_io
+
+
+_DEFAULT_CHECKPOINT_GSURL = 'gs://cloud-ml-data/img/flower_photos/inception_v3_2016_08_28.ckpt'
+
+
+def is_in_IPython():
+ try:
+ import IPython
+ return True
+ except ImportError:
+ return False
+
+
+def default_project():
+ import datalab.context
+ context = datalab.context.Context.default()
+ return context.project_id
+
+
+def open_local_or_gcs(path, mode):
+ """Opens the given path."""
+ if path.startswith('gs://'):
+ try:
+ return gcsio.GcsIO().open(path, mode)
+ except Exception as e: # pylint: disable=broad-except
+ # Currently we retry exactly once, to work around flaky gcs calls.
+ logging.error('Retrying after exception reading gcs file: %s', e)
+ time.sleep(10)
+ return gcsio.GcsIO().open(path, mode)
+ else:
+ return open(path, mode)
+
+
+def file_exists(path):
+ """Returns whether the file exists."""
+ if path.startswith('gs://'):
+ return gcsio.GcsIO().exists(path)
+ else:
+ return os.path.exists(path)
+
+
+def glob_files(path):
+ if path.startswith('gs://'):
+ return gcsio.GcsIO().glob(path)
+ else:
+ return glob.glob(path)
+
+
+def _get_latest_data_dir(input_dir):
+ latest_file = os.path.join(input_dir, 'latest')
+ if not file_exists(latest_file):
+ raise Exception(('Cannot find "latest" file in "%s". ' +
+ 'Please use a preprocessing output dir.') % input_dir)
+ with open_local_or_gcs(latest_file, 'r') as f:
+ dir_name = f.read().rstrip()
+ return os.path.join(input_dir, dir_name)
+
+
+def get_train_eval_files(input_dir):
+ """Get preprocessed training and eval files."""
+ data_dir = _get_latest_data_dir(input_dir)
+ train_pattern = os.path.join(data_dir, 'train*.tfrecord.gz')
+ eval_pattern = os.path.join(data_dir, 'eval*.tfrecord.gz')
+ train_files = glob_files(train_pattern)
+ eval_files = glob_files(eval_pattern)
+ return train_files, eval_files
+
+
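A local sketch of the 'latest' indirection that _get_latest_data_dir relies on, using temporary files instead of GCS:

    import os
    import tempfile

    input_dir = tempfile.mkdtemp()
    job_id = 'job_0001'
    os.makedirs(os.path.join(input_dir, job_id))

    # The preprocessing pipeline writes the winning job_id into <input_dir>/latest.
    with open(os.path.join(input_dir, 'latest'), 'w') as f:
        f.write(job_id)

    # _get_latest_data_dir reads that file back and joins it onto input_dir.
    with open(os.path.join(input_dir, 'latest'), 'r') as f:
        data_dir = os.path.join(input_dir, f.read().rstrip())
    print(data_dir)  # <input_dir>/job_0001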
+def get_labels(input_dir):
+ """Get a list of labels from preprocessed output dir."""
+ data_dir = _get_latest_data_dir(input_dir)
+ labels_file = os.path.join(data_dir, 'labels')
+ with open_local_or_gcs(labels_file, mode='r') as f:
+ labels = f.read().rstrip().split('\n')
+ return labels
+
+
+def read_examples(input_files, batch_size, shuffle, num_epochs=None):
+ """Creates readers and queues for reading example protos."""
+ files = []
+ for e in input_files:
+ for path in e.split(','):
+ files.extend(file_io.get_matching_files(path))
+ thread_count = multiprocessing.cpu_count()
+
+ # The minimum number of instances in a queue from which examples are drawn
+ # randomly. The larger this number, the more randomness at the expense of
+ # higher memory requirements.
+ min_after_dequeue = 1000
+
+ # When batching data, the queue's capacity will be larger than the batch_size
+ # by some factor. The recommended formula is (num_threads + a small safety
+ # margin). For now, we use a single thread for reading, so this can be small.
+ queue_size_multiplier = thread_count + 3
+
+ # Convert num_epochs == 0 -> num_epochs is None, if necessary
+ num_epochs = num_epochs or None
+
+ # Build a queue of the filenames to be read.
+ filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle)
+
+ options = tf.python_io.TFRecordOptions(
+ compression_type=tf.python_io.TFRecordCompressionType.GZIP)
+ example_id, encoded_example = tf.TFRecordReader(options=options).read_up_to(
+ filename_queue, batch_size)
+
+ if shuffle:
+ capacity = min_after_dequeue + queue_size_multiplier * batch_size
+ return tf.train.shuffle_batch(
+ [example_id, encoded_example],
+ batch_size,
+ capacity,
+ min_after_dequeue,
+ enqueue_many=True,
+ num_threads=thread_count)
+ else:
+ capacity = queue_size_multiplier * batch_size
+ return tf.train.batch(
+ [example_id, encoded_example],
+ batch_size,
+ capacity=capacity,
+ enqueue_many=True,
+ num_threads=thread_count)
+
+
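The queue sizing in read_examples follows the usual shuffle_batch recipe; with concrete, made-up numbers the two capacities work out as:

    import multiprocessing

    batch_size = 100
    thread_count = multiprocessing.cpu_count()
    min_after_dequeue = 1000
    queue_size_multiplier = thread_count + 3

    # Shuffled reads keep at least min_after_dequeue examples buffered for randomness.
    shuffle_capacity = min_after_dequeue + queue_size_multiplier * batch_size
    # Non-shuffled reads only need room for in-flight batches.
    plain_capacity = queue_size_multiplier * batch_size

    print(shuffle_capacity)
    print(plain_capacity)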
+def override_if_not_in_args(flag, argument, args):
+ """Checks if flags is in args, and if not it adds the flag to args."""
+ if flag not in args:
+ args.extend([flag, argument])
+
+
+def loss(loss_value):
+ """Calculates aggregated mean loss."""
+ total_loss = tf.Variable(0.0, False)
+ loss_count = tf.Variable(0, False)
+ total_loss_update = tf.assign_add(total_loss, loss_value)
+ loss_count_update = tf.assign_add(loss_count, 1)
+ loss_op = total_loss / tf.cast(loss_count, tf.float32)
+ return [total_loss_update, loss_count_update], loss_op
+
+
+def accuracy(logits, labels):
+ """Calculates aggregated accuracy."""
+ is_correct = tf.nn.in_top_k(logits, labels, 1)
+ correct = tf.reduce_sum(tf.cast(is_correct, tf.int32))
+ incorrect = tf.reduce_sum(tf.cast(tf.logical_not(is_correct), tf.int32))
+ correct_count = tf.Variable(0, False)
+ incorrect_count = tf.Variable(0, False)
+ correct_count_update = tf.assign_add(correct_count, correct)
+ incorrect_count_update = tf.assign_add(incorrect_count, incorrect)
+ accuracy_op = tf.cast(correct_count, tf.float32) / tf.cast(
+ correct_count + incorrect_count, tf.float32)
+ return [correct_count_update, incorrect_count_update], accuracy_op
+
+
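loss() and accuracy() accumulate counters across batches instead of averaging per-batch values; the same aggregation in plain Python, with made-up batch counts:

    correct_count = 0
    incorrect_count = 0

    # Per-batch correct/incorrect counts, e.g. from tf.nn.in_top_k on three batches.
    for correct, incorrect in [(45, 5), (40, 10), (50, 0)]:
        correct_count += correct        # mirrors tf.assign_add(correct_count, correct)
        incorrect_count += incorrect    # mirrors tf.assign_add(incorrect_count, incorrect)

    accuracy_op = float(correct_count) / (correct_count + incorrect_count)
    print(accuracy_op)  # 0.9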
+def check_dataset(dataset, mode):
+ """Validate we have a good dataset."""
+
+ names = [x['name'] for x in dataset.schema]
+ types = [x['type'] for x in dataset.schema]
+ if mode == 'train':
+ if (set(['image_url', 'label']) != set(names) or any (t != 'STRING' for t in types)):
+ raise ValueError('Invalid dataset. Expect only "image_url,label" STRING columns.')
+ else:
+ if ((set(['image_url']) != set(names) and set(['image_url', 'label']) != set(names)) or
+ any (t != 'STRING' for t in types)):
+ raise ValueError('Invalid dataset. Expect only "image_url" or "image_url,label" ' +
+ 'STRING columns.')
+
+
+def get_sources_from_dataset(p, dataset, mode):
+ """get pcollection from dataset."""
+
+ import apache_beam as beam
+ import csv
+ from datalab.ml import CsvDataSet, BigQueryDataSet
+
+ check_dataset(dataset, mode)
+ if type(dataset) is CsvDataSet:
+ source_list = []
+ for ii, input_path in enumerate(dataset.files):
+ source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >>
+ beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
+ return (source_list | 'Flatten Sources (%s)' % mode >> beam.Flatten()
+ | 'Create Dict from Csv (%s)' % mode >>
+ beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url', 'label']).next()))
+ elif type(dataset) is BigQueryDataSet:
+ bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else
+ beam.io.BigQuerySource(query=dataset.query))
+ return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
+ else:
+ raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet')
+
+
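check_dataset only accepts STRING-typed 'image_url' (and optionally 'label') columns; a quick illustration of the schema shape it expects:

    # Schema as produced by CsvDataSet/BigQueryDataSet: a list of name/type dicts.
    train_schema = [
        {'name': 'image_url', 'type': 'STRING'},
        {'name': 'label', 'type': 'STRING'},
    ]

    names = [x['name'] for x in train_schema]
    types = [x['type'] for x in train_schema]
    ok = set(['image_url', 'label']) == set(names) and all(t == 'STRING' for t in types)
    print(ok)  # True; anything else makes check_dataset(dataset, 'train') raise ValueError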
+def decode_and_resize(image_str_tensor):
+ """Decodes jpeg string, resizes it and returns a uint8 tensor."""
+
+ # These constants are set by Inception v3's expectations.
+ height = 299
+ width = 299
+ channels = 3
+
+ image = tf.image.decode_jpeg(image_str_tensor, channels=channels)
+ # Note resize expects a batch_size, but tf.map_fn suppresses that index,
+ # thus we have to expand then squeeze. Resize returns float32 in the
+ # range [0, uint8_max]
+ image = tf.expand_dims(image, 0)
+ image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
+ image = tf.squeeze(image, squeeze_dims=[0])
+ image = tf.cast(image, dtype=tf.uint8)
+ return image
+
+
+def resize_image(image_str_tensor):
+ """Decodes jpeg string, resizes it and re-encode it to jpeg."""
+
+ image = decode_and_resize(image_str_tensor)
+ image = tf.image.encode_jpeg(image, quality=100)
+ return image
+
+
+def load_images(image_files, resize=True):
+ """Load images from files and optionally resize it."""
+
+ images = []
+ for image_file in image_files:
+ with open_local_or_gcs(image_file, 'r') as ff:
+ images.append(ff.read())
+ if resize is False:
+ return images
+
+ # To resize, run a tf session so we can reuse 'decode_and_resize()'
+ # which is used in prediction graph. This makes sure we don't lose
+ # any quality in prediction, while decreasing the size of the images
+ # submitted to the model over network.
+ image_str_tensor = tf.placeholder(tf.string, shape=[None])
+ image = tf.map_fn(resize_image, image_str_tensor, back_prop=False)
+ feed_dict = collections.defaultdict(list)
+ feed_dict[image_str_tensor.name] = images
+ with tf.Session() as sess:
+ images_resized = sess.run(image, feed_dict=feed_dict)
+ return images_resized
diff --git a/solutionbox/inception/datalab_solutions/inception/setup.py b/solutionbox/inception/datalab_solutions/inception/setup.py
new file mode 100644
index 000000000..93d72fcc9
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/setup.py
@@ -0,0 +1,54 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# To publish to PyPi use: python setup.py bdist_wheel upload -r pypi
+
+import datetime
+from setuptools import setup
+
+minor = datetime.datetime.now().strftime("%y%m%d%H%M")
+version = '0.1'
+
+setup(
+ name='inception',
+ version=version,
+ packages=[
+ 'datalab_solutions',
+ 'datalab_solutions.inception',
+ ],
+
+ description='Google Cloud Datalab Inception Package',
+ author='Google',
+ author_email='google-cloud-datalab-feedback@googlegroups.com',
+ keywords=[
+ ],
+ license="Apache Software License",
+ classifiers=[
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2",
+ "Development Status :: 4 - Beta",
+ "Environment :: Other Environment",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Topic :: Software Development :: Libraries :: Python Modules"
+ ],
+ long_description="""
+ """,
+ install_requires=[
+ 'tensorflow==1.0',
+ 'protobuf==3.1.0',
+ 'google-cloud-dataflow==0.5.5',
+ ],
+ package_data={
+ }
+)
diff --git a/solutionbox/inception/datalab_solutions/inception/task.py b/solutionbox/inception/datalab_solutions/inception/task.py
new file mode 100644
index 000000000..73434118b
--- /dev/null
+++ b/solutionbox/inception/datalab_solutions/inception/task.py
@@ -0,0 +1,84 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Entry point for CloudML training.
+
+ CloudML training requires a tarball package and a python module to run. This file
+ provides such a "main" method and a list of args passed with the program.
+"""
+
+import argparse
+import json
+import logging
+import os
+import tensorflow as tf
+
+from . import _model
+from . import _trainer
+from . import _util
+
+
+def main(_):
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--input_dir',
+ type=str,
+ help='The input dir path for training and evaluation data.')
+ parser.add_argument(
+ '--output_path',
+ type=str,
+ help='The path to which checkpoints and other outputs '
+ 'should be saved. This can be either a local or GCS '
+ 'path.')
+ parser.add_argument(
+ '--max_steps',
+ type=int,)
+ parser.add_argument(
+ '--batch_size',
+ type=int,
+ help='Number of examples to be processed per mini-batch.')
+ parser.add_argument(
+ '--checkpoint',
+ type=str,
+ default=_util._DEFAULT_CHECKPOINT_GSURL,
+ help='Pretrained inception checkpoint path.')
+
+ args, _ = parser.parse_known_args()
+ labels = _util.get_labels(args.input_dir)
+ model = _model.Model(labels, 0.5, args.checkpoint)
+
+ env = json.loads(os.environ.get('TF_CONFIG', '{}'))
+ # Print the job data as provided by the service.
+ logging.info('Original job data: %s', env.get('job', {}))
+ task_data = env.get('task', None) or {'type': 'master', 'index': 0}
+ task = type('TaskSpec', (object,), task_data)
+ trial = task_data.get('trial')
+ if trial is not None:
+ args.output_path = os.path.join(args.output_path, trial)
+
+ cluster_data = env.get('cluster', None)
+ cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
+ if not cluster or not task or task.type == 'master' or task.type == 'worker':
+ _trainer.Trainer(args.input_dir, args.batch_size, args.max_steps,
+ args.output_path, model, cluster, task).run_training()
+ elif task.type == 'ps':
+ server = _trainer.start_server(cluster, task)
+ server.join()
+ else:
+ raise ValueError('invalid task_type %s' % (task.type,))
+
+if __name__ == '__main__':
+ logging.basicConfig(level=logging.INFO)
+ tf.app.run()
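task.py derives its cluster and task assignment from the TF_CONFIG environment variable set by the training service; a minimal sketch of that parsing with a hand-written config:

    import json
    import os

    # A simplified TF_CONFIG of the kind the training service would set.
    os.environ['TF_CONFIG'] = json.dumps({
        'cluster': {'master': ['host0:2222'], 'ps': ['host1:2222'], 'worker': ['host2:2222']},
        'task': {'type': 'worker', 'index': 0},
    })

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task = type('TaskSpec', (object,), task_data)
    print('%s %d' % (task.type, task.index))  # worker 0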
diff --git a/solutionbox/inception/setup.py b/solutionbox/inception/setup.py
new file mode 100644
index 000000000..93d72fcc9
--- /dev/null
+++ b/solutionbox/inception/setup.py
@@ -0,0 +1,54 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# To publish to PyPi use: python setup.py bdist_wheel upload -r pypi
+
+import datetime
+from setuptools import setup
+
+minor = datetime.datetime.now().strftime("%y%m%d%H%M")
+version = '0.1'
+
+setup(
+ name='inception',
+ version=version,
+ packages=[
+ 'datalab_solutions',
+ 'datalab_solutions.inception',
+ ],
+
+ description='Google Cloud Datalab Inception Package',
+ author='Google',
+ author_email='google-cloud-datalab-feedback@googlegroups.com',
+ keywords=[
+ ],
+ license="Apache Software License",
+ classifiers=[
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2",
+ "Development Status :: 4 - Beta",
+ "Environment :: Other Environment",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Topic :: Software Development :: Libraries :: Python Modules"
+ ],
+ long_description="""
+ """,
+ install_requires=[
+ 'tensorflow==1.0',
+ 'protobuf==3.1.0',
+ 'google-cloud-dataflow==0.5.5',
+ ],
+ package_data={
+ }
+)
diff --git a/solutionbox/structured_data/build.sh b/solutionbox/structured_data/build.sh
new file mode 100755
index 000000000..7946fbf79
--- /dev/null
+++ b/solutionbox/structured_data/build.sh
@@ -0,0 +1,8 @@
+#! /bin/bash
+
+
+rm -fr dist
+cp setup.py datalab_solutions/structured_data/master_setup.py
+python setup.py sdist
+
+
diff --git a/solutionbox/structured_data/datalab_solutions/__init__.py b/solutionbox/structured_data/datalab_solutions/__init__.py
new file mode 100644
index 000000000..3d74130ef
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py
new file mode 100644
index 000000000..76a12ce46
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+
+from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \
+ cloud_predict, local_batch_predict, cloud_batch_predict
+
+# Source of truth for the version of this package.
+__version__ = '0.0.1'
\ No newline at end of file
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/_package.py b/solutionbox/structured_data/datalab_solutions/structured_data/_package.py
new file mode 100644
index 000000000..5b33eea8b
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/_package.py
@@ -0,0 +1,586 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Provides interface for Datalab.
+
+ Datalab will look for functions with the below names:
+ local_preprocess
+ local_train
+ local_predict
+ cloud_preprocess
+ cloud_train
+ cloud_predict
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import datetime
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import urllib
+import json
+import glob
+import StringIO
+
+import pandas as pd
+import tensorflow as tf
+
+from tensorflow.python.lib.io import file_io
+
+from . import preprocess
+from . import trainer
+from . import predict
+
+
+def _default_project():
+ import datalab.context
+ context = datalab.context.Context.default()
+ return context.project_id
+
+def _is_in_IPython():
+ try:
+ import IPython
+ return True
+ except ImportError:
+ return False
+
+def _assert_gcs_files(files):
+  """Check that each file path starts with gs://.
+
+ Args:
+ files: string to file path, or list of file paths.
+ """
+ if isinstance(files, basestring):
+ files = [files]
+
+ for f in files:
+ if f is not None and not f.startswith('gs://'):
+ raise ValueError('File %s is not a gcs path' % f)
+
+
+def _package_to_staging(staging_package_url):
+ """Repackage this package from local installed location and copy it to GCS.
+
+ Args:
+ staging_package_url: GCS path.
+ """
+ import datalab.ml as ml
+
+  # Find the package root. __file__ is under [package_root]/datalab_solutions/structured_data.
+ package_root = os.path.abspath(
+ os.path.join(os.path.dirname(__file__), '../../'))
+ setup_path = os.path.abspath(
+ os.path.join(os.path.dirname(__file__), 'master_setup.py'))
+ tar_gz_path = os.path.join(staging_package_url, 'staging', 'sd.tar.gz')
+
+ print('Building package and uploading to %s' % tar_gz_path)
+ ml.package_and_copy(package_root, setup_path, tar_gz_path)
+
+ return tar_gz_path
+
+
+def local_preprocess(output_dir, dataset):
+  """Preprocess data locally.
+
+ Produce analysis used by training.
+
+ Args:
+ output_dir: The output directory to use.
+ dataset: only CsvDataSet is supported currently.
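+
+  Example usage (an illustrative sketch; the paths are hypothetical, and
+  CsvDataSet is assumed to accept a file pattern plus a schema file):
+
+    import datalab.ml as ml
+
+    train_csv = ml.CsvDataSet(file_pattern='./train-*.csv',
+                              schema_file='./schema.json')
+    local_preprocess('./preprocess_out', train_csv)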
+ """
+ import datalab.ml as ml
+ if not isinstance(dataset, ml.CsvDataSet):
+ raise ValueError('Only CsvDataSet is supported')
+
+ if len(dataset.input_files) != 1:
+ raise ValueError('CsvDataSet should be built with a file pattern, not a '
+ 'list of files.')
+
+ # Write schema to a file.
+ tmp_dir = tempfile.mkdtemp()
+ _, schema_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.json',
+ prefix='schema')
+ try:
+ file_io.write_string_to_file(schema_file_path, json.dumps(dataset.schema))
+
+ args = ['local_preprocess',
+ '--input_file_pattern=%s' % dataset.input_files[0],
+ '--output_dir=%s' % output_dir,
+ '--schema_file=%s' % schema_file_path]
+
+ print('Starting local preprocessing.')
+ preprocess.local_preprocess.main(args)
+ print('Local preprocessing done.')
+ finally:
+ shutil.rmtree(tmp_dir)
+
+def cloud_preprocess(output_dir, dataset, project_id=None):
+ """Preprocess data in the cloud with BigQuery.
+
+ Produce analysis used by training. This can take a while, even for small
+ datasets. For small datasets, it may be faster to use local_preprocess.
+
+ Args:
+ output_dir: The output directory to use.
+ dataset: only CsvDataSet is supported currently.
+ project_id: project id the table is in. If none, uses the default project.
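+
+  Example usage (an illustrative sketch; the GCS paths are hypothetical and
+  train_csv is assumed to be a CsvDataSet built from a GCS file pattern):
+
+    cloud_preprocess('gs://my-bucket/preprocess_out', train_csv)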
+ """
+ import datalab.ml as ml
+ if not isinstance(dataset, ml.CsvDataSet):
+ raise ValueError('Only CsvDataSet is supported')
+
+ if len(dataset.input_files) != 1:
+ raise ValueError('CsvDataSet should be built with a file pattern, not a '
+ 'list of files.')
+
+ _assert_gcs_files([output_dir, dataset.input_files[0]])
+
+ # Write schema to a file.
+ tmp_dir = tempfile.mkdtemp()
+ _, schema_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.json',
+ prefix='schema')
+ try:
+ file_io.write_string_to_file(schema_file_path, json.dumps(dataset.schema))
+
+ args = ['cloud_preprocess',
+ '--input_file_pattern=%s' % dataset.input_files[0],
+ '--output_dir=%s' % output_dir,
+ '--schema_file=%s' % schema_file_path]
+
+
+ print('Starting cloud preprocessing.')
+ print('Track BigQuery status at')
+ print('https://bigquery.cloud.google.com/queries/%s' % _default_project())
+ preprocess.cloud_preprocess.main(args)
+ print('Cloud preprocessing done.')
+ finally:
+ shutil.rmtree(tmp_dir)
+
+
+def local_train(train_dataset,
+ eval_dataset,
+ preprocess_output_dir,
+ output_dir,
+ transforms,
+ model_type,
+ max_steps=5000,
+ num_epochs=None,
+ train_batch_size=100,
+ eval_batch_size=100,
+ min_eval_frequency=100,
+ top_n=None,
+ layer_sizes=None,
+ learning_rate=0.01,
+ epsilon=0.0005):
+ """Train model locally.
+ Args:
+ train_dataset: CsvDataSet
+ eval_dataset: CsvDataSet
+ preprocess_output_dir: The output directory from preprocessing
+ output_dir: Output directory of training.
+ transforms: file path or transform object. Example:
+ {
+ "col_A": {"transform": "scale", "default": 0.0},
+ "col_B": {"transform": "scale","value": 4},
+ # Note col_C is missing, so default transform used.
+ "col_D": {"transform": "hash_one_hot", "hash_bucket_size": 4},
+ "col_target": {"transform": "target"},
+ "col_key": {"transform": "key"}
+ }
+ The keys correspond to the columns in the input files as defined by the
+      schema file during preprocessing. Some notes:
+      1) The "key" and "target" transforms are required.
+      2) Default values are optional. These are used if the input data has
+         missing values during training and prediction. If not supplied for a
+         column, the default value for a numerical column is that column's
+         mean value, and for a categorical column the empty string is used.
+      3) For numerical columns, the following transforms are supported:
+         i) {"transform": "identity"}: does nothing to the number. (default)
+         ii) {"transform": "scale"}: scales the column values to -1, 1.
+         iii) {"transform": "scale", "value": a}: scales the column values
+            to -a, a.
+
+         For categorical columns, the supported transforms depend on whether
+         the model is a linear or DNN model, because tf.layers is used.
+ For a linear model, the transforms supported are:
+ i) {"transform": "sparse"}: Makes a sparse vector using the full
+ vocabulary associated with the column (default).
+ ii) {"transform": "hash_sparse", "hash_bucket_size": n}: First each
+ string is hashed to an integer in the range [0, n), and then a
+ sparse vector is used.
+
+ For a DNN model, the categorical transforms that are supported are:
+ i) {"transform": "one_hot"}: A one-hot vector using the full
+ vocabulary is used. (default)
+ ii) {"transform": "embedding", "embedding_dim": d}: Each label is
+ embedded into an d-dimensional space.
+ iii) {"transform": "hash_one_hot", "hash_bucket_size": n}: The label
+ is first hashed into the range [0, n) and then a one-hot encoding
+ is made.
+ iv) {"transform": "hash_embedding", "hash_bucket_size": n,
+ "embedding_dim": d}: First each label is hashed to [0, n), and
+ then each integer is embedded into a d-dimensional space.
+ model_type: One of linear_classification, linear_regression,
+ dnn_classification, dnn_regression.
+ max_steps: Int. Number of training steps to perform.
+ num_epochs: Maximum number of training data epochs on which to train.
+ The training job will run for max_steps or num_epochs, whichever occurs
+ first.
+ train_batch_size: number of rows to train on in one step.
+ eval_batch_size: number of rows to eval in one step.
+ min_eval_frequency: Minimum number of training steps between evaluations.
+ top_n: Int. For classification problems, the output graph will contain the
+ labels and scores for the top n classes with a default of n=1. Use
+ None for regression problems.
+    layer_sizes: List. Represents the layers in the fully connected DNN.
+      If the model type is DNN, this must be set. For example, [10, 3, 2]
+      creates three DNN layers where the first layer has 10 nodes, the middle
+      layer has 3 nodes, and the last layer has 2 nodes.
+    learning_rate: tf.train.AdamOptimizer's learning rate.
+    epsilon: tf.train.AdamOptimizer's epsilon value.
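+
+  Example usage (an illustrative sketch; train_csv and eval_csv are assumed to
+  be CsvDataSet objects built from a single file pattern, and the column names
+  are hypothetical):
+
+    transforms = {
+        "key_col": {"transform": "key"},
+        "target_col": {"transform": "target"},
+        "num_col": {"transform": "scale"},
+        "str_col": {"transform": "embedding", "embedding_dim": 8},
+    }
+    local_train(train_csv, eval_csv,
+                preprocess_output_dir='./preprocess_out',
+                output_dir='./train_out',
+                transforms=transforms,
+                model_type='dnn_classification',
+                max_steps=2000,
+                layer_sizes=[10, 5])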
+ """
+ if len(train_dataset.input_files) != 1 or len(eval_dataset.input_files) != 1:
+ raise ValueError('CsvDataSets must be built with a file pattern, not list '
+ 'of files.')
+
+ if file_io.file_exists(output_dir):
+    raise ValueError('output_dir already exists. Use a new output path.')
+
+ if isinstance(transforms, dict):
+ # Make a transforms file.
+ if not file_io.file_exists(output_dir):
+ file_io.recursive_create_dir(output_dir)
+ transforms_file = os.path.join(output_dir, 'transforms_file.json')
+ file_io.write_string_to_file(
+ transforms_file,
+ json.dumps(transforms))
+ else:
+ transforms_file = transforms
+
+ args = ['local_train',
+ '--train_data_paths=%s' % train_dataset.input_files[0],
+ '--eval_data_paths=%s' % eval_dataset.input_files[0],
+ '--output_path=%s' % output_dir,
+ '--preprocess_output_dir=%s' % preprocess_output_dir,
+ '--transforms_file=%s' % transforms_file,
+ '--model_type=%s' % model_type,
+ '--max_steps=%s' % str(max_steps),
+ '--train_batch_size=%s' % str(train_batch_size),
+ '--eval_batch_size=%s' % str(eval_batch_size),
+ '--min_eval_frequency=%s' % str(min_eval_frequency),
+ '--learning_rate=%s' % str(learning_rate),
+ '--epsilon=%s' % str(epsilon)]
+ if num_epochs:
+ args.append('--num_epochs=%s' % str(num_epochs))
+ if top_n:
+ args.append('--top_n=%s' % str(top_n))
+ if layer_sizes:
+ for i in range(len(layer_sizes)):
+ args.append('--layer_size%s=%s' % (i+1, str(layer_sizes[i])))
+
+ stderr = sys.stderr
+ sys.stderr = sys.stdout
+ print('Starting local training.')
+ trainer.task.main(args)
+ print('Local training done.')
+ sys.stderr = stderr
+
+def cloud_train(train_dataset,
+ eval_dataset,
+ preprocess_output_dir,
+ output_dir,
+ transforms,
+ model_type,
+ cloud_training_config,
+ max_steps=5000,
+ num_epochs=None,
+ train_batch_size=100,
+ eval_batch_size=100,
+ min_eval_frequency=100,
+ top_n=None,
+ layer_sizes=None,
+ learning_rate=0.01,
+ epsilon=0.0005,
+ job_name=None):
+ """Train model using CloudML.
+
+ See local_train() for a description of the args.
+ Args:
+ cloud_training_config: A CloudTrainingConfig object.
+ job_name: Training job name. A default will be picked if None.
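+
+  Example usage (an illustrative sketch; the GCS paths are hypothetical, and
+  the CloudTrainingConfig field names shown are assumptions):
+
+    import datalab.ml as ml
+
+    config = ml.CloudTrainingConfig(region='us-central1', scale_tier='BASIC')
+    job = cloud_train(train_csv, eval_csv,
+                      preprocess_output_dir='gs://my-bucket/preprocess_out',
+                      output_dir='gs://my-bucket/train_out',
+                      transforms='gs://my-bucket/transforms.json',
+                      model_type='dnn_regression',
+                      cloud_training_config=config,
+                      layer_sizes=[20, 10])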
+ """
+ import datalab
+
+ if len(train_dataset.input_files) != 1 or len(eval_dataset.input_files) != 1:
+ raise ValueError('CsvDataSets must be built with a file pattern, not list '
+ 'of files.')
+
+ if file_io.file_exists(output_dir):
+    raise ValueError('output_dir already exists. Use a new output path.')
+
+ if isinstance(transforms, dict):
+ # Make a transforms file.
+ if not file_io.file_exists(output_dir):
+ file_io.recursive_create_dir(output_dir)
+ transforms_file = os.path.join(output_dir, 'transforms_file.json')
+ file_io.write_string_to_file(
+ transforms_file,
+ json.dumps(transforms))
+ else:
+ transforms_file = transforms
+
+ _assert_gcs_files([output_dir, train_dataset.input_files[0],
+ eval_dataset.input_files[0], transforms_file,
+ preprocess_output_dir])
+
+ args = ['--train_data_paths=%s' % train_dataset.input_files[0],
+ '--eval_data_paths=%s' % eval_dataset.input_files[0],
+ '--output_path=%s' % output_dir,
+ '--preprocess_output_dir=%s' % preprocess_output_dir,
+ '--transforms_file=%s' % transforms_file,
+ '--model_type=%s' % model_type,
+ '--max_steps=%s' % str(max_steps),
+ '--train_batch_size=%s' % str(train_batch_size),
+ '--eval_batch_size=%s' % str(eval_batch_size),
+ '--min_eval_frequency=%s' % str(min_eval_frequency),
+ '--learning_rate=%s' % str(learning_rate),
+ '--epsilon=%s' % str(epsilon)]
+ if num_epochs:
+ args.append('--num_epochs=%s' % str(num_epochs))
+ if top_n:
+ args.append('--top_n=%s' % str(top_n))
+ if layer_sizes:
+ for i in range(len(layer_sizes)):
+ args.append('--layer_size%s=%s' % (i+1, str(layer_sizes[i])))
+
+ job_request = {
+ 'package_uris': [_package_to_staging(output_dir)],
+ 'python_module': 'datalab_solutions.structured_data.trainer.task',
+ 'args': args
+ }
+ job_request.update(dict(cloud_training_config._asdict()))
+
+ if not job_name:
+ job_name = 'structured_data_train_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S')
+ job = datalab.ml.Job.submit_training(job_request, job_name)
+  print('Job request sent. View the status of the job at')
+ print('https://console.developers.google.com/ml/jobs?project=%s' %
+ _default_project())
+
+ return job
+
+
+def local_predict(training_ouput_dir, data):
+ """Runs local prediction on the prediction graph.
+
+ Runs local prediction and returns the result in a Pandas DataFrame. For
+ running prediction on a large dataset or saving the results, run
+  local_batch_predict or cloud_batch_predict. Input data should fully match
+ the schema that was used at training, except the target column should not
+ exist.
+
+ Args:
+ training_ouput_dir: local path to the trained output folder.
+ data: List of csv strings or a Pandas DataFrame that match the model schema.
+
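+  Example usage (an illustrative sketch; the training output folder and the
+  csv strings are hypothetical):
+
+    df = local_predict('./train_out', ['7,8.5,red', '1,2.0,blue'])
+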
+ """
+  # Save the instances to a file, call local batch prediction, and read the results back.
+ tmp_dir = tempfile.mkdtemp()
+ _, input_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.csv',
+ prefix='input')
+
+ try:
+ if isinstance(data, pd.DataFrame):
+ data.to_csv(input_file_path, header=False, index=False)
+ else:
+ with open(input_file_path, 'w') as f:
+ for line in data:
+ f.write(line + '\n')
+
+ model_dir = os.path.join(training_ouput_dir, 'model')
+ if not file_io.file_exists(model_dir):
+ raise ValueError('training_ouput_dir should contain the folder model')
+
+ cmd = ['predict.py',
+ '--predict_data=%s' % input_file_path,
+ '--trained_model_dir=%s' % model_dir,
+ '--output_dir=%s' % tmp_dir,
+ '--output_format=csv',
+ '--batch_size=100',
+ '--mode=prediction',
+ '--no-shard_files']
+
+ print('Starting local prediction.')
+ predict.predict.main(cmd)
+ print('Local prediction done.')
+
+ # Read the header file.
+ schema_file = os.path.join(tmp_dir, 'csv_schema.json')
+ with open(schema_file, 'r') as f:
+ schema = json.loads(f.read())
+
+ # Print any errors to the screen.
+ errors_file = glob.glob(os.path.join(tmp_dir, 'errors*'))
+ if errors_file and os.path.getsize(errors_file[0]) > 0:
+ print('Warning: there are errors. See below:')
+ with open(errors_file[0], 'r') as f:
+ text = f.read()
+ print(text)
+
+ # Read the predictions data.
+ prediction_file = glob.glob(os.path.join(tmp_dir, 'predictions*'))
+ if not prediction_file:
+      raise IOError('Prediction results not found')
+ predictions = pd.read_csv(prediction_file[0],
+ header=None,
+ names=[col['name'] for col in schema])
+ return predictions
+ finally:
+ shutil.rmtree(tmp_dir)
+
+
+def cloud_predict(model_name, model_version, data):
+ """Use Online prediction.
+
+  Runs online prediction in the cloud and returns the results in a Pandas
+  DataFrame. For running prediction on a large dataset or saving the results,
+  run local_batch_predict or cloud_batch_predict.
+
+ Args:
+ model_name: deployed model name
+    model_version: deployed model version
+ data: List of csv strings or a Pandas DataFrame that match the model schema.
+
+ Before using this, the model must be created. This can be done by running
+ two gcloud commands:
+ 1) gcloud beta ml models create NAME
+ 2) gcloud beta ml versions create VERSION --model NAME \
+ --origin gs://BUCKET/training_output_dir/model
+ or these datalab commands:
+ 1) import datalab
+ model = datalab.ml.ModelVersions(MODEL_NAME)
+ model.deploy(version_name=VERSION,
+ path='gs://BUCKET/training_output_dir/model')
+ Note that the model must be on GCS.
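+
+  Example usage (an illustrative sketch; the model name, version, and csv
+  strings are hypothetical):
+
+    df = cloud_predict('my_model', 'v1', ['7,8.5,red', '1,2.0,blue'])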
+ """
+ import datalab.ml as ml
+
+
+ if isinstance(data, pd.DataFrame):
+ # write the df to csv.
+ string_buffer = StringIO.StringIO()
+ data.to_csv(string_buffer, header=None, index=False)
+ input_data = string_buffer.getvalue().split('\n')
+
+    # Remove empty strings.
+ input_data = [line for line in input_data if line]
+ else:
+ input_data = data
+
+ predictions = ml.ModelVersions(model_name).predict(model_version, input_data)
+
+ # Convert predictions into a dataframe
+ df = pd.DataFrame(columns=sorted(predictions[0].keys()))
+ for i in range(len(predictions)):
+ for k, v in predictions[i].iteritems():
+ df.loc[i, k] = v
+ return df
+
+
+def local_batch_predict(training_ouput_dir, prediction_input_file, output_dir,
+ mode,
+ batch_size=1000, shard_files=True, output_format='csv'):
+ """Local batch prediction.
+
+ Args:
+ training_ouput_dir: The output folder of training.
+ prediction_input_file: csv file pattern to a local file.
+ output_dir: output location to save the results.
+ mode: 'evaluation' or 'prediction'. If 'evaluation', the input data must
+ contain a target column. If 'prediction', the input data must not
+ contain a target column.
+ batch_size: Int. How many instances to run in memory at once. Larger values
+      mean better performance but more memory consumed.
+    shard_files: If False, the output files are not sharded.
+    output_format: 'csv' or 'json'. JSON files are newline-delimited JSON.
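+
+  Example usage (an illustrative sketch; the paths are hypothetical):
+
+    local_batch_predict('./train_out', './eval-*.csv', './batch_out',
+                        mode='evaluation')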
+ """
+
+ if mode == 'evaluation':
+ model_dir = os.path.join(training_ouput_dir, 'evaluation_model')
+ elif mode == 'prediction':
+ model_dir = os.path.join(training_ouput_dir, 'model')
+ else:
+ raise ValueError('mode must be evaluation or prediction')
+
+ if not file_io.file_exists(model_dir):
+ raise ValueError('Model folder %s does not exist' % model_dir)
+
+ cmd = ['predict.py',
+ '--predict_data=%s' % prediction_input_file,
+ '--trained_model_dir=%s' % model_dir,
+ '--output_dir=%s' % output_dir,
+ '--output_format=%s' % output_format,
+ '--batch_size=%s' % str(batch_size),
+ '--shard_files' if shard_files else '--no-shard_files',
+ '--has_target' if mode == 'evaluation' else '--no-has_target'
+ ]
+
+ print('Starting local batch prediction.')
+ predict.predict.main(cmd)
+ print('Local batch prediction done.')
+
+
+
+def cloud_batch_predict(training_ouput_dir, prediction_input_file, output_dir,
+ mode,
+ batch_size=1000, shard_files=True, output_format='csv'):
+  """Cloud batch prediction. Submits a Dataflow job.
+
+ See local_batch_predict() for a description of the args.
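+
+  Example usage (an illustrative sketch; the GCS paths are hypothetical):
+
+    cloud_batch_predict('gs://my-bucket/train_out',
+                        'gs://my-bucket/eval-*.csv',
+                        'gs://my-bucket/batch_out',
+                        mode='prediction')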
+ """
+ if mode == 'evaluation':
+ model_dir = os.path.join(training_ouput_dir, 'evaluation_model')
+ elif mode == 'prediction':
+ model_dir = os.path.join(training_ouput_dir, 'model')
+ else:
+ raise ValueError('mode must be evaluation or prediction')
+
+ if not file_io.file_exists(model_dir):
+ raise ValueError('Model folder %s does not exist' % model_dir)
+
+ _assert_gcs_files([training_ouput_dir, prediction_input_file,
+ output_dir])
+
+ cmd = ['predict.py',
+ '--cloud',
+ '--project_id=%s' % _default_project(),
+ '--predict_data=%s' % prediction_input_file,
+ '--trained_model_dir=%s' % model_dir,
+ '--output_dir=%s' % output_dir,
+ '--output_format=%s' % output_format,
+ '--batch_size=%s' % str(batch_size),
+ '--shard_files' if shard_files else '--no-shard_files',
+ '--extra_package=%s' % _package_to_staging(output_dir)]
+
+ print('Starting cloud batch prediction.')
+ predict.predict.main(cmd)
+ print('See above link for job status.')
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py b/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py
new file mode 100644
index 000000000..aee640276
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py
@@ -0,0 +1,73 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# A copy of this file must be made in datalab_solutions/structured_data/setup.py
+
+import datetime
+import os
+import re
+from setuptools import setup
+
+  provides such a "main" method and parses the args passed to the program.
+
+# The version is saved in an __init__ file.
+def get_version():
+ VERSIONFILE = os.path.join('datalab_solutions/structured_data/',
+ '__init__.py')
+ if not os.path.isfile(VERSIONFILE):
+ raise ValueError('setup.py: File not found %s' % VERSIONFILE)
+ initfile_lines = open(VERSIONFILE, 'rt').readlines()
+ VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]"
+ for line in initfile_lines:
+ mo = re.search(VSRE, line, re.M)
+ if mo:
+ return mo.group(1)
+ raise RuntimeError('Unable to find version string in %s.' % (VERSIONFILE,))
+
+
+setup(
+ name='structured_data',
+ version=get_version(),
+ packages=[
+ 'datalab_solutions',
+ 'datalab_solutions.structured_data',
+ 'datalab_solutions.structured_data.trainer',
+ 'datalab_solutions.structured_data.preprocess',
+ 'datalab_solutions.structured_data.predict',
+ ],
+ description='Google Cloud Datalab Structured Data Package',
+ author='Google',
+ author_email='google-cloud-datalab-feedback@googlegroups.com',
+ keywords=[
+ ],
+ license="Apache Software License",
+ classifiers=[
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2",
+ "Development Status :: 4 - Beta",
+ "Environment :: Other Environment",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Topic :: Software Development :: Libraries :: Python Modules"
+ ],
+ long_description="""
+ """,
+ install_requires=[
+ 'tensorflow==1.0',
+ 'protobuf==3.1.0',
+ 'google-cloud-dataflow==0.5.5'
+ ],
+ package_data={
+ },
+ data_files=[],
+)
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py
new file mode 100644
index 000000000..b1c31965d
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import predict
+
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py b/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py
new file mode 100644
index 000000000..458d5e677
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py
@@ -0,0 +1,415 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Runs prediction on a trained model."""
+
+
+import argparse
+import datetime
+import os
+import sys
+
+import apache_beam as beam
+
+
+def parse_arguments(argv):
+ """Parse command line arguments.
+
+ Args:
+ argv: includes the script's name.
+
+ Returns:
+ argparse object
+ """
+ parser = argparse.ArgumentParser(
+ description='Runs Prediction inside a beam or Dataflow job.')
+ # cloud options
+ parser.add_argument('--project_id',
+ help='The project to which the job will be submitted.')
+ parser.add_argument('--cloud',
+ action='store_true',
+ help='Run preprocessing on the cloud.')
+ parser.add_argument('--job_name',
+ default=('structured-data-batch-prediction-'
+ + datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
+ help='Dataflow job name. Must be unique over all jobs.')
+ parser.add_argument('--extra_package',
+ default=[],
+ action='append',
+ help=('If using --cloud, also installs these packages on '
+ 'each dataflow worker'))
+
+ # I/O args
+ parser.add_argument('--predict_data',
+ required=True,
+ help='Data to run prediction on')
+ parser.add_argument('--trained_model_dir',
+ required=True,
+ help='Usually train_output_path/model.')
+ parser.add_argument('--output_dir',
+ required=True,
+ help=('Location to save output.'))
+
+ # Other args
+ parser.add_argument('--batch_size',
+ required=False,
+ default=1000,
+ type=int,
+                      help=('Batch size. Larger values consume more memory '
+                            'but take less time to finish.'))
+ parser.add_argument('--shard_files',
+ dest='shard_files',
+ action='store_true',
+ help='Shard files')
+ parser.add_argument('--no-shard_files',
+ dest='shard_files',
+ action='store_false',
+ help='Don\'t shard files')
+ parser.set_defaults(shard_files=True)
+
+ parser.add_argument('--output_format',
+ choices=['csv', 'json'],
+ default='csv',
+ help="""
+                        The output format.
+                          json: produces a newline-delimited file where each line is json. No
+                              post processing is performed and the output matches what the trained
+                              model produces.
+                          csv: produces a csv file without a header row and a header csv file.
+                              For classification problems, the vector of probabilities for each
+                              target class is split into individual csv columns.""")
+
+ args, _ = parser.parse_known_args(args=argv[1:])
+
+ if args.cloud:
+ if not args.project_id:
+ raise ValueError('--project_id needed with --cloud')
+ if not args.trained_model_dir.startswith('gs://'):
+      raise ValueError('trained_model_dir needs to be a GCS path.')
+ if not args.output_dir.startswith('gs://'):
+ raise ValueError('output_dir needs to be a GCS path.')
+ if not args.predict_data.startswith('gs://'):
+ raise ValueError('predict_data needs to be a GCS path.')
+
+
+ return args
+
+
+class EmitAsBatchDoFn(beam.DoFn):
+ """A DoFn that buffers the records and emits them batch by batch."""
+
+ def __init__(self, batch_size):
+ """Constructor of EmitAsBatchDoFn beam.DoFn class.
+
+ Args:
+      batch_size: the maximum number of records to buffer before emitting.
+ """
+ self._batch_size = batch_size
+ self._cached = []
+
+ def process(self, element):
+ self._cached.append(element)
+ if len(self._cached) >= self._batch_size:
+ emit = self._cached
+ self._cached = []
+ yield emit
+
+ def finish_bundle(self, element=None):
+ if len(self._cached) > 0: # pylint: disable=g-explicit-length-test
+ yield self._cached
+
+
+class RunGraphDoFn(beam.DoFn):
+ """A DoFn for running the TF graph."""
+
+ def __init__(self, trained_model_dir):
+ self._trained_model_dir = trained_model_dir
+ self._session = None
+
+ def start_bundle(self, element=None):
+ from tensorflow.python.saved_model import tag_constants
+ from tensorflow.contrib.session_bundle import bundle_shim
+ import json
+
+ self._session, meta_graph = bundle_shim.load_session_bundle_or_saved_model_bundle_from_path(self._trained_model_dir, tags=[tag_constants.SERVING])
+ signature = meta_graph.signature_def['serving_default']
+
+ # get the mappings between aliases and tensor names
+ # for both inputs and outputs
+ self._input_alias_map = {friendly_name: tensor_info_proto.name
+ for (friendly_name, tensor_info_proto) in signature.inputs.items() }
+ self._output_alias_map = {friendly_name: tensor_info_proto.name
+ for (friendly_name, tensor_info_proto) in signature.outputs.items() }
+ self._aliases, self._tensor_names = zip(*self._output_alias_map.items())
+
+
+ def finish_bundle(self, element=None):
+ self._session.close()
+
+
+ def process(self, element):
+    """Run batch prediction on a TF graph.
+
+ Args:
+ element: list of strings, representing one batch input to the TF graph.
+ """
+ import collections
+ import apache_beam as beam
+
+ num_in_batch = 0
+ try:
+ assert self._session is not None
+
+ feed_dict = collections.defaultdict(list)
+ for line in element:
+
+ # Remove trailing newline.
+ if line.endswith('\n'):
+ line = line[:-1]
+
+ feed_dict[self._input_alias_map.values()[0]].append(line)
+ num_in_batch += 1
+
+ # batch_result is list of numpy arrays with batch_size many rows.
+ batch_result = self._session.run(fetches=self._tensor_names,
+ feed_dict=feed_dict)
+
+ # ex batch_result for batch_size > 1:
+ # (array([value1, value2, ..., value_batch_size]),
+ # array([[a1, b1, c1]], ..., [a_batch_size, b_batch_size, c_batch_size]]),
+ # ...)
+ # ex batch_result for batch_size == 1:
+ # (value,
+ # array([a1, b1, c1]),
+ # ...)
+
+ # Convert the results into a dict and unbatch the results.
+ if num_in_batch > 1:
+ for result in zip(*batch_result):
+ predictions = {}
+ for name, value in zip(self._aliases, result):
+ predictions[name] = (value.tolist() if getattr(value, 'tolist', None)
+ else value)
+ yield predictions
+ else:
+ predictions = {}
+ for i in range(len(self._aliases)):
+ value = batch_result[i]
+ value = (value.tolist() if getattr(value, 'tolist', None)
+ else value)
+ predictions[self._aliases[i]] = value
+ yield predictions
+
+ except Exception as e: # pylint: disable=broad-except
+ yield beam.pvalue.SideOutputValue('errors',
+ (str(e), element))
+
+
+class RawJsonCoder(beam.coders.Coder):
+ """Coder for json newline files."""
+
+ def encode(self, obj):
+ """Encodes a python object into a JSON string.
+
+ Args:
+ obj: python object.
+
+ Returns:
+ JSON string.
+ """
+ import json
+ return json.dumps(obj, separators=(',', ': '))
+
+
+class CSVCoder(beam.coders.Coder):
+  """Coder for CSV files containing the output of prediction."""
+
+ def __init__(self, header):
+ """Sets the headers in the csv file.
+
+ Args:
+ header: list of strings that correspond to keys in the predictions dict.
+ """
+ self._header = header
+
+ def make_header_string(self):
+ return ','.join(self._header)
+
+ def encode(self, tf_graph_predictions):
+ """Encodes the graph json prediction into csv.
+
+ Args:
+ tf_graph_predictions: python dict.
+
+ Returns:
+ csv string.
+ """
+ row = []
+ for col in self._header:
+ row.append(str(tf_graph_predictions[col]))
+
+ return ','.join(row)
+
+
+class FormatAndSave(beam.PTransform):
+
+ def __init__(self, args):
+ self._shard_name_template = None if args.shard_files else ''
+ self._output_format = args.output_format
+ self._output_dir = args.output_dir
+
+ # Get the BQ schema if csv.
+ if self._output_format == 'csv':
+ from tensorflow.python.saved_model import tag_constants
+ from tensorflow.contrib.session_bundle import bundle_shim
+ from tensorflow.core.framework import types_pb2
+
+ session, meta_graph = bundle_shim.load_session_bundle_or_saved_model_bundle_from_path(args.trained_model_dir, tags=[tag_constants.SERVING])
+ signature = meta_graph.signature_def['serving_default']
+
+ self._schema = []
+ for friendly_name in sorted(signature.outputs):
+ tensor_info_proto = signature.outputs[friendly_name]
+
+ # TODO(brandondutra): Could dtype be DT_INVALID?
+ # Consider getting the dtype from the graph via
+ # session.graph.get_tensor_by_name(tensor_info_proto.name).dtype)
+ dtype = tensor_info_proto.dtype
+ if dtype == types_pb2.DT_FLOAT or dtype == types_pb2.DT_DOUBLE:
+ bq_type = 'FLOAT'
+ elif dtype == types_pb2.DT_INT32 or dtype == types_pb2.DT_INT64:
+ bq_type = 'INTEGER'
+ else:
+ bq_type = 'STRING'
+
+ self._schema.append({'mode': 'NULLABLE',
+ 'name': friendly_name,
+ 'type': bq_type})
+ session.close()
+
+ def apply(self, datasets):
+ return self.expand(datasets)
+
+ def expand(self, datasets):
+ import json
+
+ tf_graph_predictions, errors = datasets
+
+ if self._output_format == 'json':
+ _ = (
+ tf_graph_predictions
+ | 'Write Raw JSON'
+ >> beam.io.textio.WriteToText(
+ os.path.join(self._output_dir, 'predictions'),
+ file_name_suffix='.json',
+ coder=RawJsonCoder(),
+ shard_name_template=self._shard_name_template))
+ elif self._output_format == 'csv':
+ # make a csv header file
+ header = [col['name'] for col in self._schema]
+ csv_coder = CSVCoder(header)
+ _ = (
+ tf_graph_predictions.pipeline
+ | 'Make CSV Header'
+ >> beam.Create([json.dumps(self._schema, indent=2)])
+ | 'Write CSV Schema File'
+ >> beam.io.textio.WriteToText(
+ os.path.join(self._output_dir, 'csv_schema'),
+ file_name_suffix='.json',
+ shard_name_template=''))
+
+ # Write the csv predictions
+ _ = (
+ tf_graph_predictions
+ | 'Write CSV'
+ >> beam.io.textio.WriteToText(
+ os.path.join(self._output_dir, 'predictions'),
+ file_name_suffix='.csv',
+ coder=csv_coder,
+ shard_name_template=self._shard_name_template))
+ else:
+      raise ValueError('FormatAndSave: unknown format %s' % self._output_format)
+
+
+ # Write the errors to a text file.
+ _ = (errors
+ | 'Write Errors'
+ >> beam.io.textio.WriteToText(
+ os.path.join(self._output_dir, 'errors'),
+ file_name_suffix='.txt',
+ shard_name_template=self._shard_name_template))
+
+
+def make_prediction_pipeline(pipeline, args):
+ """Builds the prediction pipeline.
+
+  Reads the csv files, prepends a ',' if the target column is missing, runs
+  prediction, and then writes the formatted results to a file.
+
+ Args:
+ pipeline: the pipeline
+ args: command line args
+ """
+ predicted_values, errors = (
+ pipeline
+ | 'Read CSV Files'
+ >> beam.io.ReadFromText(str(args.predict_data), # DF bug: DF does not work with unicode strings
+ strip_trailing_newlines=True)
+ | 'Batch Input'
+ >> beam.ParDo(EmitAsBatchDoFn(args.batch_size))
+ | 'Run TF Graph on Batches'
+ >> (beam.ParDo(RunGraphDoFn(args.trained_model_dir))
+ .with_outputs('errors', main='main')))
+
+ _ = (
+ (predicted_values, errors)
+ | 'Format and Save'
+ >> FormatAndSave(args))
+
+
+def main(argv=None):
+ args = parse_arguments(sys.argv if argv is None else argv)
+
+ if args.cloud:
+ options = {
+ 'staging_location': os.path.join(args.output_dir, 'tmp', 'staging'),
+ 'temp_location': os.path.join(args.output_dir, 'tmp', 'staging'),
+ 'job_name': args.job_name,
+ 'project': args.project_id,
+ 'no_save_main_session': True,
+ 'extra_packages': args.extra_package,
+ 'teardown_policy': 'TEARDOWN_ALWAYS',
+ }
+ opts = beam.pipeline.PipelineOptions(flags=[], **options)
+ # Or use BlockingDataflowPipelineRunner
+ p = beam.Pipeline('DataflowRunner', options=opts)
+ else:
+ p = beam.Pipeline('DirectRunner')
+
+ make_prediction_pipeline(p, args)
+
+ if args.cloud:
+ print(('Dataflow Job submitted, see Job %s at '
+ 'https://console.developers.google.com/dataflow?project=%s') %
+ (options['job_name'], args.project_id))
+ sys.stdout.flush()
+
+ r = p.run()
+ try:
+ r.wait_until_finish()
+ except AttributeError:
+ pass
+
+
+if __name__ == '__main__':
+ main()
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py
new file mode 100644
index 000000000..feba1d08c
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import cloud_preprocess
+import local_preprocess
\ No newline at end of file
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py
new file mode 100644
index 000000000..2c86497b5
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py
@@ -0,0 +1,273 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import pandas as pd
+import StringIO
+import sys
+
+
+from tensorflow.python.lib.io import file_io
+
+SCHEMA_FILE = 'schema.json'
+NUMERICAL_ANALYSIS_FILE = 'numerical_analysis.json'
+CATEGORICAL_ANALYSIS_FILE = 'vocab_%s.csv'
+
+
+def parse_arguments(argv):
+ """Parse command line arguments.
+
+ Args:
+    argv: list of command line arguments, including the program name.
+
+ Returns:
+ An argparse Namespace object.
+
+ Raises:
+ ValueError: for bad parameters
+ """
+ parser = argparse.ArgumentParser(
+ description='Runs Preprocessing on structured data.')
+ parser.add_argument('--output_dir',
+ type=str,
+ required=True,
+                      help='Google Cloud Storage location in which to place outputs.')
+
+ parser.add_argument('--schema_file',
+ type=str,
+ required=False,
+ help=('BigQuery json schema file'))
+ parser.add_argument('--input_file_pattern',
+ type=str,
+ required=False,
+ help='Input CSV file names. May contain a file pattern')
+
+ # If using bigquery table
+ # TODO(brandondutra): maybe also support an sql input, so the table can be
+ # ad-hoc.
+ parser.add_argument('--bigquery_table',
+ type=str,
+ required=False,
+ help=('project:dataset.table_name'))
+
+ args = parser.parse_args(args=argv[1:])
+
+ if not args.output_dir.startswith('gs://'):
+ raise ValueError('--output_dir must point to a location on GCS')
+
+ if args.bigquery_table:
+ if args.schema_file or args.input_file_pattern:
+ raise ValueError('If using --bigquery_table, then --schema_file and '
+                       '--input_file_pattern '
+ 'are not needed.')
+ else:
+ if not args.schema_file or not args.input_file_pattern:
+ raise ValueError('If not using --bigquery_table, then --schema_file and '
+ '--input_file_pattern '
+ 'are required.')
+
+ if not args.input_file_pattern.startswith('gs://'):
+ raise ValueError('--input_file_pattern must point to files on GCS')
+
+ return args
+
+
+def parse_table_name(bigquery_table):
+  """Given a string a:b.c, returns b.c.
+
+  Args:
+    bigquery_table: full table name project_id:dataset.table
+
+  Returns:
+    dataset.table
+
+  Raises:
+    ValueError: if the table name is not in the form project_id:dataset.table.
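+
+  For example, parse_table_name('myproject:mydataset.mytable') returns
+  'mydataset.mytable' (the project id and table name here are hypothetical).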
+ """
+
+ id_name = bigquery_table.split(':')
+ if len(id_name) != 2:
+ raise ValueError('Bigquery table name should be in the form '
+ 'project_id:dataset.table_name. Got %s' % bigquery_table)
+ return id_name[1]
+
+
+def run_numerical_analysis(table, schema_list, args):
+ """Find min/max values for the numerical columns and writes a json file.
+
+ Args:
+    table: Reference to a FederatedTable (if args.bigquery_table is not set) or
+      a regular Table (otherwise).
+ schema_list: Bigquery schema json object
+ args: the command line args
+ """
+ import datalab.bigquery as bq
+
+ # Get list of numerical columns.
+ numerical_columns = []
+ for col_schema in schema_list:
+ col_type = col_schema['type'].lower()
+ if col_type == 'integer' or col_type == 'float':
+ numerical_columns.append(col_schema['name'])
+
+
+ # Run the numerical analysis
+ if numerical_columns:
+ sys.stdout.write('Running numerical analysis...')
+ max_min = [
+ ('max({name}) as max_{name}, '
+ 'min({name}) as min_{name}, '
+ 'avg({name}) as avg_{name} ').format(name=name)
+ for name in numerical_columns]
+ if args.bigquery_table:
+ sql = 'SELECT %s from %s' % (', '.join(max_min),
+ parse_table_name(args.bigquery_table))
+ numerical_results = bq.Query(sql).to_dataframe()
+ else:
+ sql = 'SELECT %s from csv_table' % ', '.join(max_min)
+ query = bq.Query(sql, data_sources={'csv_table': table})
+ numerical_results = query.to_dataframe()
+
+ # Convert the numerical results to a json file.
+ results_dict = {}
+ for name in numerical_columns:
+ results_dict[name] = {'max': numerical_results.iloc[0]['max_%s' % name],
+ 'min': numerical_results.iloc[0]['min_%s' % name],
+ 'mean':numerical_results.iloc[0]['avg_%s' % name]}
+
+ file_io.write_string_to_file(
+ os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE),
+ json.dumps(results_dict, indent=2, separators=(',', ': ')))
+
+ sys.stdout.write('done.\n')
+
+
+def run_categorical_analysis(table, schema_list, args):
+ """Find vocab values for the categorical columns and writes a csv file.
+
+  The vocab files are in the form
+ label1
+ label2
+ label3
+ ...
+
+ Args:
+    table: Reference to a FederatedTable (if args.bigquery_table is not set) or
+      a regular Table (otherwise).
+ schema_list: Bigquery schema json object
+ args: the command line args
+ """
+ import datalab.bigquery as bq
+
+
+ # Get list of categorical columns.
+ categorical_columns = []
+ for col_schema in schema_list:
+ col_type = col_schema['type'].lower()
+ if col_type == 'string':
+ categorical_columns.append(col_schema['name'])
+
+ if categorical_columns:
+ sys.stdout.write('Running categorical analysis...')
+ for name in categorical_columns:
+ if args.bigquery_table:
+ table_name = parse_table_name(args.bigquery_table)
+ else:
+ table_name = 'table_name'
+
+ sql = """
+ SELECT
+ {name},
+ FROM
+ {table}
+ WHERE
+ {name} IS NOT NULL
+ GROUP BY
+ {name}
+ ORDER BY
+ {name}
+ """.format(name=name, table=table_name)
+ out_file = os.path.join(args.output_dir,
+ CATEGORICAL_ANALYSIS_FILE % name)
+
+ # extract_async seems to have a bug and sometimes hangs. So get the
+      # results directly.
+ if args.bigquery_table:
+ df = bq.Query(sql).to_dataframe()
+ else:
+ query = bq.Query(sql, data_sources={table_name: table})
+ df = query.to_dataframe()
+
+ # Write the results to a file.
+ string_buff = StringIO.StringIO()
+ df.to_csv(string_buff, index=False, header=False)
+ file_io.write_string_to_file(out_file, string_buff.getvalue())
+
+
+ sys.stdout.write('done.\n')
+
+
+def run_analysis(args):
+ """Builds an analysis file for training.
+
+  Uses BigQuery tables to do the analysis.
+
+ Args:
+ args: command line args
+
+ Raises:
+    ValueError: if the schema contains unknown types.
+ """
+ import datalab.bigquery as bq
+ if args.bigquery_table:
+ table = bq.Table(args.bigquery_table)
+ schema_list = table.schema._bq_schema
+ else:
+ schema_list = json.loads(file_io.read_file_to_string(args.schema_file))
+ table = bq.FederatedTable().from_storage(
+ source=args.input_file_pattern,
+ source_format='csv',
+ ignore_unknown_values=False,
+ max_bad_records=0,
+ compressed=False,
+ schema=bq.Schema(schema_list))
+
+ # Check the schema is supported.
+ for col_schema in schema_list:
+ col_type = col_schema['type'].lower()
+ if col_type != 'string' and col_type != 'integer' and col_type != 'float':
+ raise ValueError('Unknown schema type %s' % col_type)
+
+ run_numerical_analysis(table, schema_list, args)
+ run_categorical_analysis(table, schema_list, args)
+
+ # Save a copy of the schema to the output location.
+ file_io.write_string_to_file(
+ os.path.join(args.output_dir, SCHEMA_FILE),
+ json.dumps(schema_list, indent=2, separators=(',', ': ')))
+
+
+def main(argv=None):
+ args = parse_arguments(sys.argv if argv is None else argv)
+ run_analysis(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py
new file mode 100644
index 000000000..5a5d1a5a9
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py
@@ -0,0 +1,166 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import argparse
+import collections
+import json
+import os
+import sys
+
+
+from tensorflow.python.lib.io import file_io
+
+
+SCHEMA_FILE = 'schema.json'
+NUMERICAL_ANALYSIS_FILE = 'numerical_analysis.json'
+CATEGORICAL_ANALYSIS_FILE = 'vocab_%s.csv'
+
+
+def parse_arguments(argv):
+ """Parse command line arguments.
+
+ Args:
+    argv: list of command line arguments, including the program name.
+
+ Returns:
+ An argparse Namespace object.
+ """
+ parser = argparse.ArgumentParser(
+ description='Runs Preprocessing on structured CSV data.')
+ parser.add_argument('--input_file_pattern',
+ type=str,
+ required=True,
+ help='Input CSV file names. May contain a file pattern')
+ parser.add_argument('--output_dir',
+ type=str,
+ required=True,
+                      help='Google Cloud Storage or local directory in which to place outputs.')
+ parser.add_argument('--schema_file',
+ type=str,
+ required=True,
+ help=('BigQuery json schema file'))
+
+ args = parser.parse_args(args=argv[1:])
+
+  # Make sure the output folder exists if it is a local folder.
+ file_io.recursive_create_dir(args.output_dir)
+
+ return args
+
+
+def run_numerical_categorical_analysis(args, schema_list):
+ """Makes the numerical and categorical analysis files.
+
+ Args:
+ args: the command line args
+ schema_list: python object of the schema json file.
+
+ Raises:
+ ValueError: if schema contains unknown column types.
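+
+  The numerical analysis file written to output_dir is a json dict keyed by
+  column name with min/max/mean values, for example (values illustrative):
+
+    {"num1": {"min": 0.0, "max": 30.0, "mean": 14.9}}
+
+  Each categorical (string) column also gets a vocab_<name>.csv file with one
+  unique label per line.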
+ """
+ header = [column['name'] for column in schema_list]
+ input_files = file_io.get_matching_files(args.input_file_pattern)
+
+ # Check the schema is valid
+ for col_schema in schema_list:
+ col_type = col_schema['type'].lower()
+ if col_type != 'string' and col_type != 'integer' and col_type != 'float':
+ raise ValueError('Schema contains an unsupported type %s.' % col_type)
+
+ # initialize the results
+ def _init_numerical_results():
+ return {'min': float('inf'),
+ 'max': float('-inf'),
+ 'count': 0,
+ 'sum': 0.0}
+ numerical_results = collections.defaultdict(_init_numerical_results)
+ categorical_results = collections.defaultdict(set)
+
+ # for each file, update the numerical stats from that file, and update the set
+ # of unique labels.
+ for input_file in input_files:
+ with file_io.FileIO(input_file, 'r') as f:
+ for line in f:
+ parsed_line = dict(zip(header, line.strip().split(',')))
+
+ for col_schema in schema_list:
+ col_name = col_schema['name']
+ col_type = col_schema['type']
+ if col_type.lower() == 'string':
+ categorical_results[col_name].update([parsed_line[col_name]])
+ else:
+ # numerical column.
+
+ # if empty, skip
+ if not parsed_line[col_name].strip():
+              continue
+
+ numerical_results[col_name]['min'] = (
+ min(numerical_results[col_name]['min'],
+ float(parsed_line[col_name])))
+ numerical_results[col_name]['max'] = (
+ max(numerical_results[col_name]['max'],
+ float(parsed_line[col_name])))
+ numerical_results[col_name]['count'] += 1
+ numerical_results[col_name]['sum'] += float(parsed_line[col_name])
+
+  # Update numerical_results to just have min/max/mean
+ for col_schema in schema_list:
+ if col_schema['type'].lower() != 'string':
+ col_name = col_schema['name']
+ mean = numerical_results[col_name]['sum'] / numerical_results[col_name]['count']
+ del numerical_results[col_name]['sum']
+ del numerical_results[col_name]['count']
+ numerical_results[col_name]['mean'] = mean
+
+
+ # Write the numerical_results to a json file.
+ file_io.write_string_to_file(
+ os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE),
+ json.dumps(numerical_results, indent=2, separators=(',', ': ')))
+
+ # Write the vocab files. Each label is on its own line.
+ for name, unique_labels in categorical_results.iteritems():
+ labels = '\n'.join(list(unique_labels))
+ file_io.write_string_to_file(
+ os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name),
+ labels)
+
+
+def run_analysis(args):
+  """Builds analysis files for training."""
+
+ # Read the schema and input feature types
+ schema_list = json.loads(file_io.read_file_to_string(args.schema_file))
+
+ run_numerical_categorical_analysis(args, schema_list)
+
+ # Also save a copy of the schema in the output folder.
+ file_io.copy(args.schema_file,
+ os.path.join(args.output_dir, SCHEMA_FILE),
+ overwrite=True)
+
+
+def main(argv=None):
+ args = parse_arguments(sys.argv if argv is None else argv)
+ run_analysis(args)
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py
new file mode 100644
index 000000000..f057df8e8
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py
@@ -0,0 +1,208 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import os
+import random
+import json
+import subprocess
+
+
+def make_csv_data(filename, num_rows, problem_type, keep_target=True):
+ """Writes csv data for preprocessing and training.
+
+ Args:
+    filename: the csv file path to write the data to.
+ num_rows: how many rows of data will be generated.
+ problem_type: 'classification' or 'regression'. Changes the target value.
+    keep_target: if False, the target column is omitted from the csv file.
+ """
+ random.seed(12321)
+ with open(filename, 'w') as f1:
+ for i in range(num_rows):
+ num1 = random.uniform(0, 30)
+ num2 = random.randint(0, 20)
+ num3 = random.uniform(0, 10)
+
+ str1 = random.choice(['red', 'blue', 'green', 'pink', 'yellow', 'brown', 'black'])
+ str2 = random.choice(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr'])
+ str3 = random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone'])
+
+ map1 = {'red': 2, 'blue': 6, 'green': 4, 'pink': -5, 'yellow': -6, 'brown': -1, 'black': 7}
+ map2 = {'abc': 10, 'def': 1, 'ghi': 1, 'jkl': 1, 'mno': 1, 'pqr': 1}
+ map3 = {'car': 5, 'truck': 10, 'van': 15, 'bike': 20, 'train': 25, 'drone': 30}
+
+ # Build the target from a simple linear model of the inputs.
+ t = 0.5 + 0.5*num1 - 2.5*num2 + num3
+ t += map1[str1] + map2[str2] + map3[str3]
+
+ if problem_type == 'classification':
+ if t < 0:
+ t = 100
+ elif t < 20:
+ t = 101
+ else:
+ t = 102
+
+ if keep_target:
+ csv_line = "{id},{target},{num1},{num2},{num3},{str1},{str2},{str3}\n".format(
+ id=i,
+ target=t,
+ num1=num1,
+ num2=num2,
+ num3=num3,
+ str1=str1,
+ str2=str2,
+ str3=str3)
+ else:
+ csv_line = "{id},{num1},{num2},{num3},{str1},{str2},{str3}\n".format(
+ id=i,
+ num1=num1,
+ num2=num2,
+ num3=num3,
+ str1=str1,
+ str2=str2,
+ str3=str3)
+ f1.write(csv_line)
+
+
+def make_preprocess_schema(filename, problem_type):
+ """Makes a schema file compatable with the output of make_csv_data.
+
+ Writes a json file.
+
+ Args:
+ filename: output file path
+ problem_type: regression or classification
+ """
+ schema = [
+ {
+ "mode": "NULLABLE",
+ "name": "key",
+ "type": "STRING"
+ },
+ {
+ "mode": "REQUIRED",
+ "name": "target",
+ "type": ("STRING" if problem_type == 'classification' else "FLOAT")
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "num1",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "num2",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "num3",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "str1",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "str2",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "str3",
+ "type": "STRING"
+ }
+ ]
+ with open(filename, 'w') as f:
+ f.write(json.dumps(schema))
+
+
+def run_preprocess(output_dir, csv_filename, schema_filename):
+ """Runs local_preprocess.py on the given csv and schema files."""
+ preprocess_script = os.path.abspath(
+ os.path.join(os.path.dirname(__file__),
+ '../preprocess/local_preprocess.py'))
+
+ cmd = ['python', preprocess_script,
+ '--output_dir', output_dir,
+ '--input_file_pattern', csv_filename,
+ '--schema_file', schema_filename
+ ]
+ print('Going to run command: %s' % ' '.join(cmd))
+ subprocess.check_call(cmd) #, stderr=open(os.devnull, 'wb'))
+
+
+def run_training(
+ train_data_paths,
+ eval_data_paths,
+ output_path,
+ preprocess_output_dir,
+ transforms_file,
+ max_steps,
+ model_type,
+ extra_args=[]):
+ """Runs Training via gcloud beta ml local train.
+
+ Args:
+ train_data_paths: training csv files
+ eval_data_paths: eval csv files
+ output_path: folder to write output to
+ preprocess_output_dir: output location of preprocessing
+ transforms_file: path to transforms file
+ max_steps: max training steps
+ model_type: {dnn,linear}_{regression,classification}
+ extra_args: array of strings, passed to the trainer.
+
+ Returns:
+ The stderr of training as one string. TensorFlow logs to stderr, so this is
+ effectively the full output of training.
+ """
+
+ # gcloud must be run from the parent folder of task.py, so cd there first.
+ task_parent_folder = os.path.abspath(
+ os.path.join(os.path.dirname(__file__), '..'))
+ cmd = ['cd %s &&' % task_parent_folder,
+ 'gcloud beta ml local train',
+ '--module-name=trainer.task',
+ '--package-path=trainer',
+ '--',
+ '--train_data_paths=%s' % train_data_paths,
+ '--eval_data_paths=%s' % eval_data_paths,
+ '--output_path=%s' % output_path,
+ '--preprocess_output_dir=%s' % preprocess_output_dir,
+ '--transforms_file=%s' % transforms_file,
+ '--model_type=%s' % model_type,
+ '--max_steps=%s' % max_steps] + extra_args
+ print('Going to run command: %s' % ' '.join(cmd))
+ sp = subprocess.Popen(' '.join(cmd), shell=True, stderr=subprocess.PIPE)
+ _, err = sp.communicate()
+ return err
+
+if __name__ == '__main__':
+ make_csv_data('raw_train_regression.csv', 5000, 'regression', True)
+ make_csv_data('raw_eval_regression.csv', 1000, 'regression', True)
+ make_csv_data('raw_predict_regression.csv', 100, 'regression', False)
+ make_preprocess_schema('schema_regression.json', 'regression')
+
+ make_csv_data('raw_train_classification.csv', 5000, 'classification', True)
+ make_csv_data('raw_eval_classification.csv', 1000, 'classification', True)
+ make_csv_data('raw_predict_classification.csv', 100, 'classification', False)
+ make_preprocess_schema('schema_classification.json', 'classification')
+
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py
new file mode 100644
index 000000000..c1e0dfa80
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py
@@ -0,0 +1,104 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import glob
+import json
+import os
+import shutil
+import subprocess
+import filecmp
+import tempfile
+import unittest
+
+import tensorflow as tf
+
+import e2e_functions
+
+
+class TestPreprocess(unittest.TestCase):
+
+ def setUp(self):
+ self._test_dir = tempfile.mkdtemp()
+
+ self._csv_filename = os.path.join(self._test_dir, 'raw_csv_data.csv')
+ self._schema_filename = os.path.join(self._test_dir, 'schema.json')
+
+ self._preprocess_output = os.path.join(self._test_dir, 'pout')
+
+ def tearDown(self):
+ print('TestPreprocess: removing test dir: ' + self._test_dir)
+ shutil.rmtree(self._test_dir)
+
+
+ def _make_test_data(self, problem_type):
+ """Makes input files to run preprocessing on.
+
+ Args:
+ problem_type: 'regression' or 'classification'
+ """
+ e2e_functions.make_csv_data(self._csv_filename, 100, problem_type, True)
+ e2e_functions.make_preprocess_schema(self._schema_filename, problem_type)
+
+
+ def _test_preprocess(self, problem_type):
+ self._make_test_data(problem_type)
+
+ e2e_functions.run_preprocess(
+ output_dir=self._preprocess_output,
+ csv_filename=self._csv_filename,
+ schema_filename=self._schema_filename)
+
+
+ schema_file = os.path.join(self._preprocess_output, 'schema.json')
+ numerical_analysis_file = os.path.join(self._preprocess_output, 'numerical_analysis.json')
+
+ # test schema file was copied
+ self.assertTrue(filecmp.cmp(schema_file, self._schema_filename))
+
+ expected_numerical_keys = ['num1', 'num2', 'num3']
+ if problem_type == 'regression':
+ expected_numerical_keys.append('target')
+
+ # Load the numerical analysis file and check it has the right keys
+ with open(numerical_analysis_file, 'r') as f:
+ analysis = json.load(f)
+ self.assertEqual(sorted(expected_numerical_keys), sorted(analysis.keys()))
+
+ # Check that the vocab files are made
+ expected_vocab_files = ['vocab_str1.csv', 'vocab_str2.csv',
+ 'vocab_str3.csv', 'vocab_key.csv']
+ if problem_type == 'classification':
+ expected_vocab_files.append('vocab_target.csv')
+
+ for name in expected_vocab_files:
+ vocab_file = os.path.join(self._preprocess_output, name)
+ self.assertTrue(os.path.exists(vocab_file))
+ self.assertGreater(os.path.getsize(vocab_file), 0)
+
+ all_expected_files = (expected_vocab_files + ['numerical_analysis.json',
+ 'schema.json'])
+ all_file_paths = glob.glob(os.path.join(self._preprocess_output, '*'))
+ all_files = [os.path.basename(path) for path in all_file_paths]
+ self.assertEqual(sorted(all_expected_files), sorted(all_files))
+
+
+ def testRegression(self):
+ self._test_preprocess('regression')
+
+ def testClassification(self):
+ self._test_preprocess('classification')
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py
new file mode 100644
index 000000000..6d2b7b7cd
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py
@@ -0,0 +1,244 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import json
+import os
+import re
+import shutil
+import tempfile
+import unittest
+
+import e2e_functions
+
+
+class TestTrainer(unittest.TestCase):
+ """Tests training.
+
+ Each test builds a csv test dataset. Preprocessing is run on the data to
+ produce the analysis files. Training is then run, and its output is collected
+ so the accuracy/loss values can be inspected.
+ """
+
+ def setUp(self):
+ self._test_dir = tempfile.mkdtemp()
+ self._preprocess_output = os.path.join(self._test_dir, 'pre')
+ self._train_output = os.path.join(self._test_dir, 'train')
+
+ os.mkdir(self._preprocess_output)
+ os.mkdir(self._train_output)
+
+ self._csv_train_filename = os.path.join(self._test_dir, 'train_csv_data.csv')
+ self._csv_eval_filename = os.path.join(self._test_dir, 'eval_csv_data.csv')
+ self._schema_filename = os.path.join(self._test_dir, 'schema.json')
+ self._input_features_filename = os.path.join(self._test_dir,
+ 'input_features_file.json')
+
+ self._transforms_filename = os.path.join(self._test_dir, 'transforms.json')
+
+
+ def tearDown(self):
+ print('TestTrainer: removing test dir ' + self._test_dir)
+ shutil.rmtree(self._test_dir)
+
+
+ def _run_training(self, problem_type, model_type, transforms, extra_args=[]):
+ """Runs training.
+
+ Output is saved to _training_screen_output. Nothing from training should be
+ printed to the screen.
+
+ Args:
+ problem_type: 'regression' or 'classification'
+ model_type: 'linear' or 'dnn'
+ transforms: JSON object of the transforms file.
+ extra_args: list of strings to pass to the trainer.
+ """
+ # Run preprocessing.
+ e2e_functions.make_csv_data(self._csv_train_filename, 100, problem_type, True)
+ e2e_functions.make_csv_data(self._csv_eval_filename, 100, problem_type, True)
+ e2e_functions.make_preprocess_schema(self._schema_filename, problem_type)
+
+ e2e_functions.run_preprocess(
+ output_dir=self._preprocess_output,
+ csv_filename=self._csv_train_filename,
+ schema_filename=self._schema_filename)
+
+ # Write the transforms file.
+ with open(self._transforms_filename, 'w') as f:
+ f.write(json.dumps(transforms, indent=2, separators=(',', ': ')))
+
+ # Run training and save the output.
+ output = e2e_functions.run_training(
+ train_data_paths=self._csv_train_filename,
+ eval_data_paths=self._csv_eval_filename,
+ output_path=self._train_output,
+ preprocess_output_dir=self._preprocess_output,
+ transforms_file=self._transforms_filename,
+ max_steps=2500,
+ model_type=model_type + '_' + problem_type,
+ extra_args=extra_args)
+ self._training_screen_output = output
+ #print(self._training_screen_output)
+
+
+ def _check_training_screen_output(self, accuracy=None, loss=None):
+ """Should be called after _run_training.
+
+ Inspects self._training_screen_output for correct output.
+
+ Args:
+ accuracy: float. Eval accuracy should be greater than this number.
+ loss: float. Eval loss should be less than this number.
+ """
+ # Print the last line of training output which has the loss value.
+ last_line = None
+ lines = self._training_screen_output.splitlines()
+ for line in lines:
+ if line.startswith('INFO:tensorflow:Saving dict for global step %s:' % 2500):
+ last_line = line
+ break
+ self.assertIsNotNone(last_line, 'Eval line for step 2500 not found.')
+ print(last_line)
+
+ # supports positive numbers (int, real) with exponential form support.
+ positive_number_re = re.compile('[+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
+
+ # Check it made it to step 2500
+ saving_num_re = re.compile('global_step = \d+')
+ saving_num = saving_num_re.findall(last_line)
+ # saving_num == ['global_step = NUM']
+ self.assertEqual(len(saving_num), 1)
+ step_num = positive_number_re.findall(saving_num[0])
+ # step_num == ['2500']
+ self.assertEqual(len(step_num), 1)
+ self.assertEqual(int(step_num[0]), 2500)
+
+ # Check the accuracy
+ if accuracy is not None:
+ accuracy_eq_num_re = re.compile('accuracy = [+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
+ accuracy_eq_num = accuracy_eq_num_re.findall(last_line)
+ # accuracy_eq_num == ['accuracy = NUM']
+ self.assertEqual(len(accuracy_eq_num), 1)
+ accuracy_num = positive_number_re.findall(accuracy_eq_num[0])
+ # accuracy_num == ['X.XXX']
+ self.assertEqual(len(accuracy_num), 1)
+ self.assertGreater(float(accuracy_num[0]), accuracy)
+
+ if loss is not None:
+ loss_eq_num_re = re.compile('loss = [+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
+ loss_eq_num = loss_eq_num_re.findall(last_line)
+ # loss_eq_num == ['loss = NUM']
+ self.assertEqual(len(loss_eq_num), 1)
+ loss_num = positive_number_re.findall(loss_eq_num[0])
+ # loss_num == ['X.XXX']
+ self.assertEqual(len(loss_num), 1)
+ self.assertLess(float(loss_num[0]), loss)
+
+
+ def _check_train_files(self):
+ model_folder = os.path.join(self._train_output,
+ 'train/export/prediction_model')
+ self.assertTrue(
+ os.path.isfile(os.path.join(model_folder, 'saved_model.pb')))
+ self.assertTrue(
+ os.path.isfile(os.path.join(model_folder, 'variables/variables.index')))
+ self.assertTrue(
+ os.path.isfile(os.path.join(model_folder, 'assets.extra/schema.json')))
+ self.assertTrue(
+ os.path.isfile(os.path.join(model_folder, 'assets.extra/transforms.json')))
+
+
+ def testRegressionDnn(self):
+ print('\n\nTesting Regression DNN')
+
+ transforms = {
+ "num1": {"transform": "scale"},
+ "num2": {"transform": "scale","value": 4},
+ "str1": {"transform": "hash_embedding", "embedding_dim": 2, "hash_bucket_size": 4},
+ "str2": {"transform": "embedding", "embedding_dim": 3},
+ "target": {"transform": "target"},
+ "key": {"transform": "key"},
+ }
+
+ extra_args = ['--layer_size1=10', '--layer_size2=10', '--layer_size3=5']
+ self._run_training(problem_type='regression',
+ model_type='dnn',
+ transforms=transforms,
+ extra_args=extra_args)
+
+ self._check_training_screen_output(loss=20)
+ self._check_train_files()
+
+
+ def testRegressionLinear(self):
+ print('\n\nTesting Regression Linear')
+
+ transforms = {
+ "num1": {"transform": "scale"},
+ "num2": {"transform": "scale","value": 4},
+ "str1": {"transform": "hash_sparse", "hash_bucket_size": 2},
+ "str2": {"transform": "hash_sparse", "hash_bucket_size": 2},
+ "str3": {"transform": "hash_sparse", "hash_bucket_size": 2},
+ "target": {"transform": "target"},
+ "key": {"transform": "key"},
+ }
+
+ self._run_training(problem_type='regression',
+ model_type='linear',
+ transforms=transforms)
+
+ self._check_training_screen_output(loss=100)
+ self._check_train_files()
+
+
+ def testClassificationDnn(self):
+ print('\n\nTesting classification DNN')
+
+ transforms = {
+ "num1": {"transform": "scale"},
+ "num2": {"transform": "scale","value": 4},
+ "str1": {"transform": "hash_one_hot", "hash_bucket_size": 4},
+ "str2": {"transform": "one_hot"},
+ "str3": {"transform": "embedding", "embedding_dim": 3},
+ "target": {"transform": "target"},
+ "key": {"transform": "key"}
+ }
+
+ extra_args = ['--layer_size1=10', '--layer_size2=10', '--layer_size3=5']
+ self._run_training(problem_type='classification',
+ model_type='dnn',
+ transforms=transforms,
+ extra_args=extra_args)
+
+ self._check_training_screen_output(accuracy=0.95, loss=0.09)
+ self._check_train_files()
+
+
+ def testClassificationLinear(self):
+ print('\n\nTesting classification Linear')
+
+ transforms = {
+ "num1": {"transform": "scale"},
+ "num2": {"transform": "scale","value": 4},
+ "str1": {"transform": "hash_sparse", "hash_bucket_size": 4},
+ "str2": {"transform": "sparse"},
+ "target": {"transform": "target"},
+ "key": {"transform": "key"},
+ }
+
+ self._run_training(problem_type='classification',
+ model_type='linear',
+ transforms=transforms)
+
+ self._check_training_screen_output(accuracy=0.90, loss=0.2)
+ self._check_train_files()
+
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py
new file mode 100755
index 000000000..dc494405f
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import task
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py
new file mode 100755
index 000000000..5cfbd1ba3
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py
@@ -0,0 +1,269 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import re
+import sys
+import math
+
+from . import util
+import tensorflow as tf
+from tensorflow.contrib import metrics as metrics_lib
+
+from tensorflow.contrib.learn.python.learn import learn_runner
+from tensorflow.contrib.session_bundle import manifest_pb2
+from tensorflow.python.lib.io import file_io
+
+
+def get_reader_input_fn(train_config, preprocess_output_dir, model_type,
+ data_paths, batch_size, shuffle, num_epochs=None):
+ """Builds input layer for training."""
+
+ def get_input_features():
+ """Read the input features from the given data paths."""
+ _, examples = util.read_examples(
+ input_files=data_paths,
+ batch_size=batch_size,
+ shuffle=shuffle,
+ num_epochs=num_epochs)
+ features = util.parse_example_tensor(examples=examples,
+ train_config=train_config,
+ keep_target=True)
+
+ target_name = train_config['target_column']
+ target = features.pop(target_name)
+ features, target = util.preprocess_input(
+ features=features,
+ target=target,
+ train_config=train_config,
+ preprocess_output_dir=preprocess_output_dir,
+ model_type=model_type)
+
+ return features, target
+
+ # Return a function to input the features into the model from a data path.
+ return get_input_features
+
+
+def get_experiment_fn(args):
+ """Builds the experiment function for learn_runner.run.
+
+ Args:
+ args: the command line args
+
+ Returns:
+ A function that returns a tf.learn experiment object.
+ """
+
+ def get_experiment(output_dir):
+ # Merge schema, input features, and transforms.
+ train_config = util.merge_metadata(args.preprocess_output_dir,
+ args.transforms_file)
+
+ # Get the model to train.
+ estimator = util.get_estimator(output_dir, train_config, args)
+
+ # Save a copy of the schema and transforms file to the model folder.
+ schema_file = os.path.join(args.preprocess_output_dir, util.SCHEMA_FILE)
+
+ # Make list of files to save with the trained model.
+ additional_assets = {'transforms.json': args.transforms_file,
+ util.SCHEMA_FILE: schema_file}
+ if util.is_classification_model(args.model_type):
+ target_name = train_config['target_column']
+ vocab_file_name = util.CATEGORICAL_ANALYSIS % target_name
+ vocab_file_path = os.path.join(
+ args.preprocess_output_dir, vocab_file_name)
+ assert file_io.file_exists(vocab_file_path)
+ additional_assets[vocab_file_name] = vocab_file_path
+
+ export_strategy_target = util.make_export_strategy(
+ train_config=train_config,
+ args=args,
+ keep_target=True,
+ assets_extra=additional_assets)
+ export_strategy_notarget = util.make_export_strategy(
+ train_config=train_config,
+ args=args,
+ keep_target=False,
+ assets_extra=additional_assets)
+
+ input_reader_for_train = get_reader_input_fn(
+ train_config=train_config,
+ preprocess_output_dir=args.preprocess_output_dir,
+ model_type=args.model_type,
+ data_paths=args.train_data_paths,
+ batch_size=args.train_batch_size,
+ shuffle=True,
+ num_epochs=args.num_epochs)
+
+ input_reader_for_eval = get_reader_input_fn(
+ train_config=train_config,
+ preprocess_output_dir=args.preprocess_output_dir,
+ model_type=args.model_type,
+ data_paths=args.eval_data_paths,
+ batch_size=args.eval_batch_size,
+ shuffle=False,
+ num_epochs=1)
+
+ # Set the eval metrics.
+ # TODO(brandondutra): make this work with HP tuning.
+ if util.is_classification_model(args.model_type):
+ streaming_accuracy = metrics_lib.streaming_accuracy
+ eval_metrics = {
+ ('accuracy', 'classes'): streaming_accuracy,
+ # Export the accuracy as a metric for hyperparameter tuning.
+ #('training/hptuning/metric', 'classes'): streaming_accuracy
+ }
+ else:
+ eval_metrics = None
+
+ return tf.contrib.learn.Experiment(
+ estimator=estimator,
+ train_input_fn=input_reader_for_train,
+ eval_input_fn=input_reader_for_eval,
+ train_steps=args.max_steps,
+ export_strategies=[export_strategy_target, export_strategy_notarget],
+ min_eval_frequency=args.min_eval_frequency,
+ )
+
+ # Return a function to create an Experiment.
+ return get_experiment
+
+
+def parse_arguments(argv):
+ """Parse the command line arguments."""
+ parser = argparse.ArgumentParser(
+ description=('Train a regression or classification model. Note that if '
+ 'using a DNN model, --layer_size1=NUM, --layer_size2=NUM, '
+ 'should be used. '))
+
+ # I/O file parameters
+ parser.add_argument('--train_data_paths', type=str, action='append',
+ required=True)
+ parser.add_argument('--eval_data_paths', type=str, action='append',
+ required=True)
+ parser.add_argument('--output_path', type=str, required=True)
+ parser.add_argument('--preprocess_output_dir',
+ type=str,
+ required=True,
+ help=('Output folder of preprocessing. Should contain'
+ ' schema.json, numerical_analysis.json, and the'
+ ' vocab_*.csv files produced by preprocessing.'
+ ' Path must be on GCS if running cloud training.'))
+ parser.add_argument('--transforms_file',
+ type=str,
+ required=True,
+ help=('File describing the the transforms to apply on '
+ 'each column'))
+
+ # HP parameters
+ parser.add_argument('--learning_rate', type=float, default=0.01,
+ help='tf.train.AdamOptimizer learning rate')
+ parser.add_argument('--epsilon', type=float, default=0.0005,
+ help='tf.train.AdamOptimizer epsilon')
+ # --layer_size See below
+
+ # Model problems
+ parser.add_argument('--model_type',
+ choices=['linear_classification', 'linear_regression',
+ 'dnn_classification', 'dnn_regression'],
+ required=True)
+ parser.add_argument('--top_n',
+ type=int,
+ default=1,
+ help=('For classification problems, the output graph '
+ 'will contain the labels and scores for the top '
+ 'n classes.'))
+ # Training input parameters
+ parser.add_argument('--max_steps', type=int, default=5000,
+ help='Maximum number of training steps to perform.')
+ parser.add_argument('--num_epochs',
+ type=int,
+ help=('Maximum number of training data epochs on which '
+ 'to train. If both --max-steps and --num-epochs '
+ 'are specified, the training job will run for '
+ '--max-steps or --num-epochs, whichever occurs '
+ 'first. If unspecified will run for --max-steps.'))
+ parser.add_argument('--train_batch_size', type=int, default=1000)
+ parser.add_argument('--eval_batch_size', type=int, default=1000)
+ parser.add_argument('--min_eval_frequency', type=int, default=100,
+ help=('Minimum number of training steps between '
+ 'evaluations'))
+
+ # Training output parameters
+ parser.add_argument('--save_checkpoints_secs', type=int, default=600,
+ help=('How often the model should be checkpointed/saved '
+ 'in seconds'))
+
+ args, remaining_args = parser.parse_known_args(args=argv[1:])
+
+ # All HP parameters must be unique, so we need to support an unknown number
+ # of --layer_size1=10 --layer_size2=10 ...
+ # Look at remaining_args for layer_size\d+ to get the layer info.
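+ # For example (hypothetical flags): remaining_args = ['--layer_size1=10',
+ # '--layer_size2=5'] results in num_layers = 2 and layer_sizes = [10, 5].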
+
+ # Get number of layers
+ pattern = re.compile('layer_size(\d+)')
+ num_layers = 0
+ for other_arg in remaining_args:
+ match = re.search(pattern, other_arg)
+ if match:
+ num_layers = max(num_layers, int(match.group(1)))
+
+ # Build a new parser so we catch unknown args and missing layer_sizes.
+ parser = argparse.ArgumentParser()
+ for i in range(num_layers):
+ parser.add_argument('--layer_size%s' % str(i+1), type=int, required=True)
+
+ layer_args = vars(parser.parse_args(args=remaining_args))
+ layer_sizes = []
+ for i in range(num_layers):
+ key = 'layer_size%s' % str(i+1)
+ layer_sizes.append(layer_args[key])
+
+ assert len(layer_sizes) == num_layers
+ args.layer_sizes = layer_sizes
+
+ return args
+
+
+def main(argv=None):
+ """Run a Tensorflow model on the Iris dataset."""
+ args = parse_arguments(sys.argv if argv is None else argv)
+
+ env = json.loads(os.environ.get('TF_CONFIG', '{}'))
+ # First find out if there's a task value on the environment variable.
+ # If there is none or it is empty define a default one.
+ task_data = env.get('task') or {'type': 'master', 'index': 0}
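+ # An illustrative TF_CONFIG value (shape assumed from the keys read here):
+ # {"task": {"type": "master", "index": 0, "trial": "1"}}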
+
+ trial = task_data.get('trial')
+ if trial is not None:
+ output_dir = os.path.join(args.output_path, trial)
+ else:
+ output_dir = args.output_path
+
+ learn_runner.run(
+ experiment_fn=get_experiment_fn(args),
+ output_dir=output_dir)
+
+
+if __name__ == '__main__':
+ tf.logging.set_verbosity(tf.logging.INFO)
+ main()
diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py
new file mode 100755
index 000000000..316d98b9d
--- /dev/null
+++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py
@@ -0,0 +1,861 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import multiprocessing
+import os
+import math
+from StringIO import StringIO
+
+import tensorflow as tf
+from tensorflow.python.lib.io import file_io
+
+from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.contrib.learn.python.learn import export_strategy
+from tensorflow.contrib.learn.python.learn.utils import (
+ saved_model_export_utils)
+
+from tensorflow.python.ops import variables
+from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
+from tensorflow.python.training import saver
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import compat
+from tensorflow.python.platform import gfile
+from tensorflow.python.saved_model import signature_def_utils
+
+
+SCHEMA_FILE = 'schema.json'
+NUMERICAL_ANALYSIS = 'numerical_analysis.json'
+CATEGORICAL_ANALYSIS = 'vocab_%s.csv'
+
+
+# Constants for the Prediction Graph fetch tensors.
+PG_TARGET = 'target_from_input'
+
+PG_REGRESSION_PREDICTED_TARGET = 'predicted_target'
+PG_CLASSIFICATION_LABEL_TEMPLATE = 'top_%s_label'
+PG_CLASSIFICATION_SCORE_TEMPLATE = 'top_%s_score'
+
+# ==============================================================================
+# Exporting the last trained model to a final location
+# ==============================================================================
+
+
+def _copy_all(src_files, dest_dir):
+ # file_io.copy does not copy files into folders directly.
+ for src_file in src_files:
+ file_name = os.path.basename(src_file)
+ new_file_location = os.path.join(dest_dir, file_name)
+ file_io.copy(src_file, new_file_location, overwrite=True)
+
+
+def _recursive_copy(src_dir, dest_dir):
+ """Copy the contents of src_dir into the folder dest_dir.
+ Args:
+ src_dir: gcs or local path.
+ dest_dir: gcs or local path.
+ When called, dest_dir should exist.
+ """
+ file_io.recursive_create_dir(dest_dir)
+ for file_name in file_io.list_directory(src_dir):
+ old_path = os.path.join(src_dir, file_name)
+ new_path = os.path.join(dest_dir, file_name)
+
+ if file_io.is_directory(old_path):
+ _recursive_copy(old_path, new_path)
+ else:
+ file_io.copy(old_path, new_path, overwrite=True)
+
+def serving_from_csv_input(train_config, args, keep_target):
+ """Read the input features from a placeholder csv string tensor."""
+ examples = tf.placeholder(
+ dtype=tf.string,
+ shape=(None,),
+ name='csv_input_string')
+
+ features = parse_example_tensor(examples=examples,
+ train_config=train_config,
+ keep_target=keep_target)
+
+ if keep_target:
+ target = features.pop(train_config['target_column'])
+ else:
+ target = None
+ features, target = preprocess_input(
+ features=features,
+ target=target,
+ train_config=train_config,
+ preprocess_output_dir=args.preprocess_output_dir,
+ model_type=args.model_type)
+
+ return input_fn_utils.InputFnOps(features,
+ target,
+ {'csv_line': examples}
+ )
+
+
+def make_output_tensors(train_config, args, input_ops, model_fn_ops, keep_target=True):
+ """Builds the dict of output tensors for the prediction graph."""
+ target_name = train_config['target_column']
+ key_name = train_config['key_column']
+
+ outputs = {}
+ outputs[key_name] = tf.squeeze(input_ops.features[key_name])
+
+ if is_classification_model(args.model_type):
+
+ # build maps from ints to the original categorical strings.
+ string_value = get_vocabulary(args.preprocess_output_dir, target_name)
+ table = tf.contrib.lookup.index_to_string_table_from_tensor(
+ mapping=string_value,
+ default_value='UNKNOWN')
+
+ # Get the label of the input target.
+ if keep_target:
+ input_target_label = table.lookup(input_ops.labels)
+ outputs[PG_TARGET] = tf.squeeze(input_target_label)
+
+ # TODO(brandondutra): get the score of the target label too.
+ probabilities = model_fn_ops.predictions['probabilities']
+
+ # get top k labels and their scores.
+ (top_k_values, top_k_indices) = tf.nn.top_k(probabilities, k=args.top_n)
+ top_k_labels = table.lookup(tf.to_int64(top_k_indices))
+
+ # Write the top_k values using 2*top_k columns.
+ num_digits = int(math.ceil(math.log(args.top_n, 10)))
+ if num_digits == 0:
+ num_digits = 1
+ for i in range(0, args.top_n):
+ # Pad i based on the size of k. So if k = 100, i = 23 -> i = '023'. This
+ # makes sorting the columns easy.
+ padded_i = str(i+1).zfill(num_digits)
+
+ label_alias = PG_CLASSIFICATION_LABEL_TEMPLATE % padded_i
+ label_tensor = tf.squeeze(
+ tf.slice(top_k_labels,
+ [0, i],
+ [tf.shape(top_k_labels)[0], 1]))
+ score_alias = PG_CLASSIFICATION_SCORE_TEMPLATE % padded_i
+ score_tensor = tf.squeeze(
+ tf.slice(top_k_values,
+ [0, i],
+ [tf.shape(top_k_values)[0], 1]))
+
+ outputs.update({label_alias: label_tensor,
+ score_alias: score_tensor})
+
+ else:
+ if keep_target:
+ outputs[PG_TARGET] = tf.squeeze(input_ops.labels)
+
+ scores = model_fn_ops.predictions['scores']
+ outputs[PG_REGRESSION_PREDICTED_TARGET] = tf.squeeze(scores)
+
+ return outputs
+
+
+def make_export_strategy(train_config, args, keep_target, assets_extra=None):
+ """Makes an ExportStrategy that exports the serving graph and extra assets."""
+ def export_fn(estimator, export_dir_base, checkpoint_path=None, eval_result=None):
+ with ops.Graph().as_default() as g:
+ contrib_variables.create_global_step(g)
+
+ input_ops = serving_from_csv_input(train_config, args, keep_target)
+ model_fn_ops = estimator._call_model_fn(input_ops.features,
+ None,
+ model_fn_lib.ModeKeys.INFER)
+ output_fetch_tensors = make_output_tensors(
+ train_config=train_config,
+ args=args,
+ input_ops=input_ops,
+ model_fn_ops=model_fn_ops,
+ keep_target=keep_target)
+
+ signature_def_map = {
+ 'serving_default':
+ signature_def_utils.predict_signature_def(
+ input_ops.default_inputs,
+ output_fetch_tensors)
+ }
+
+ if not checkpoint_path:
+ # Locate the latest checkpoint
+ checkpoint_path = saver.latest_checkpoint(estimator._model_dir)
+ if not checkpoint_path:
+ raise NotFittedError("Couldn't find trained model at %s."
+ % estimator._model_dir)
+
+ export_dir = saved_model_export_utils.get_timestamped_export_dir(
+ export_dir_base)
+
+ with tf_session.Session('') as session:
+ #variables.initialize_local_variables()
+ variables.local_variables_initializer()
+ data_flow_ops.tables_initializer()
+ saver_for_restore = saver.Saver(
+ variables.global_variables(),
+ sharded=True)
+ saver_for_restore.restore(session, checkpoint_path)
+
+ init_op = control_flow_ops.group(
+ variables.local_variables_initializer(),
+ data_flow_ops.tables_initializer())
+
+ # Perform the export
+ builder = saved_model_builder.SavedModelBuilder(export_dir)
+ builder.add_meta_graph_and_variables(
+ session, [tag_constants.SERVING],
+ signature_def_map=signature_def_map,
+ assets_collection=ops.get_collection(
+ ops.GraphKeys.ASSET_FILEPATHS),
+ legacy_init_op=init_op)
+ builder.save(False)
+
+ # Add the extra assets
+ if assets_extra:
+ assets_extra_path = os.path.join(compat.as_bytes(export_dir),
+ compat.as_bytes('assets.extra'))
+ for dest_relative, source in assets_extra.items():
+ dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
+ compat.as_bytes(dest_relative))
+ dest_path = os.path.dirname(dest_absolute)
+ gfile.MakeDirs(dest_path)
+ gfile.Copy(source, dest_absolute)
+
+ # only keep the last 3 models
+ saved_model_export_utils.garbage_collect_exports(export_dir_base, exports_to_keep=3)
+
+ # save the last model to the model folder.
+ # export_dir_base = A/B/intermediate_models/
+ if keep_target:
+ final_dir = os.path.join(args.output_path, 'evaluation_model')
+ else:
+ final_dir = os.path.join(args.output_path, 'model')
+ if file_io.is_directory(final_dir):
+ file_io.delete_recursively(final_dir)
+ file_io.recursive_create_dir(final_dir)
+ _recursive_copy(export_dir, final_dir)
+
+
+ return export_dir
+
+ if keep_target:
+ intermediate_dir = 'intermediate_evaluation_models'
+ else:
+ intermediate_dir = 'intermediate_prediction_models'
+
+ return export_strategy.ExportStrategy(intermediate_dir, export_fn)
+
+
+# ==============================================================================
+# Reading the input csv files and parsing its output into tensors.
+# ==============================================================================
+
+
+def parse_example_tensor(examples, train_config, keep_target):
+ """Read the csv files.
+
+ Args:
+ examples: string tensor
+ train_config: training config
+ keep_target: if true, the target column is expected to exist and it is
+ returned in the features dict.
+
+ Returns:
+ Dict of feature_name to tensor. Target feature is in the dict.
+ """
+
+ csv_header = []
+ if keep_target:
+ csv_header = train_config['csv_header']
+ else:
+ csv_header = [name for name in train_config['csv_header']
+ if name != train_config['target_column']]
+
+ # record_defaults are used by tf.decode_csv to insert defaults, and to infer
+ # the datatype.
+ record_defaults = [[train_config['csv_defaults'][name]]
+ for name in csv_header]
+ tensors = tf.decode_csv(examples, record_defaults, name='csv_to_tensors')
+
+ # expand_dims gives each tensor shape (batch_size, 1) instead of
+ # (batch_size,); regression models error without it.
+ tensors = [tf.expand_dims(x, axis=1) for x in tensors]
+
+ tensor_dict = dict(zip(csv_header, tensors))
+ return tensor_dict
+
+
+def read_examples(input_files, batch_size, shuffle, num_epochs=None):
+ """Creates readers and queues for reading example protos."""
+ files = []
+ for e in input_files:
+ for path in e.split(','):
+ files.extend(file_io.get_matching_files(path))
+ thread_count = multiprocessing.cpu_count()
+
+ # The minimum number of instances in a queue from which examples are drawn
+ # randomly. The larger this number, the more randomness at the expense of
+ # higher memory requirements.
+ min_after_dequeue = 1000
+
+ # When batching data, the queue's capacity will be larger than the batch_size
+ # by some factor. The recommended formula is (num_threads + a small safety
+ # margin). For now, we use a single thread for reading, so this can be small.
+ queue_size_multiplier = thread_count + 3
+
+ # Convert num_epochs == 0 -> num_epochs is None, if necessary
+ num_epochs = num_epochs or None
+
+ # Build a queue of the filenames to be read.
+ filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle)
+
+ example_id, encoded_example = tf.TextLineReader().read_up_to(
+ filename_queue, batch_size)
+
+ if shuffle:
+ capacity = min_after_dequeue + queue_size_multiplier * batch_size
+ return tf.train.shuffle_batch(
+ [example_id, encoded_example],
+ batch_size,
+ capacity,
+ min_after_dequeue,
+ enqueue_many=True,
+ num_threads=thread_count)
+
+ else:
+ capacity = queue_size_multiplier * batch_size
+ return tf.train.batch(
+ [example_id, encoded_example],
+ batch_size,
+ capacity=capacity,
+ enqueue_many=True,
+ num_threads=thread_count)
+
+
+# ==============================================================================
+# Building the TF learn estimators
+# ==============================================================================
+
+
+def get_estimator(output_dir, train_config, args):
+ """Returns a tf learn estimator.
+
+ We only support {DNN, Linear}Regressor and {DNN, Linear}Classifier. This is
+ controlled by the values of model_type in the args.
+
+ Args:
+ output_dir: Models are saved into output_dir/train
+ train_config: our training config
+ args: command line parameters
+
+ Returns:
+ TF learn estimator
+
+ Raises:
+ ValueError: if config is wrong.
+ """
+
+ # Check the requested mode fits the preprocessed data.
+ target_name = train_config['target_column']
+ if (is_classification_model(args.model_type) and
+ target_name not in train_config['categorical_columns']):
+ raise ValueError('When using a classification model, the target must be a '
+ 'categorical variable.')
+ if (is_regression_model(args.model_type) and
+ target_name not in train_config['numerical_columns']):
+ raise ValueError('When using a regression model, the target must be a '
+ 'numerical variable.')
+
+ # Check layers used for dnn models.
+ if is_dnn_model(args.model_type) and not args.layer_sizes:
+ raise ValueError('--layer_size* must be used with DNN models')
+ if is_linear_model(args.model_type) and args.layer_sizes:
+ raise ValueError('--layer_size* cannot be used with linear models')
+
+ # Build tf.learn features
+ feature_columns = _tflearn_features(train_config, args)
+
+ # Set how often to run checkpointing in terms of time.
+ config = tf.contrib.learn.RunConfig(
+ save_checkpoints_secs=args.save_checkpoints_secs)
+
+ train_dir = os.path.join(output_dir, 'train')
+ if args.model_type == 'dnn_regression':
+ estimator = tf.contrib.learn.DNNRegressor(
+ feature_columns=feature_columns,
+ hidden_units=args.layer_sizes,
+ config=config,
+ model_dir=train_dir,
+ optimizer=tf.train.AdamOptimizer(
+ args.learning_rate, epsilon=args.epsilon))
+ elif args.model_type == 'linear_regression':
+ estimator = tf.contrib.learn.LinearRegressor(
+ feature_columns=feature_columns,
+ config=config,
+ model_dir=train_dir,
+ optimizer=tf.train.AdamOptimizer(
+ args.learning_rate, epsilon=args.epsilon))
+ elif args.model_type == 'dnn_classification':
+ estimator = tf.contrib.learn.DNNClassifier(
+ feature_columns=feature_columns,
+ hidden_units=args.layer_sizes,
+ n_classes=train_config['vocab_stats'][target_name]['n_classes'],
+ config=config,
+ model_dir=train_dir,
+ optimizer=tf.train.AdamOptimizer(
+ args.learning_rate, epsilon=args.epsilon))
+ elif args.model_type == 'linear_classification':
+ estimator = tf.contrib.learn.LinearClassifier(
+ feature_columns=feature_columns,
+ n_classes=train_config['vocab_stats'][target_name]['n_classes'],
+ config=config,
+ model_dir=train_dir,
+ optimizer=tf.train.AdamOptimizer(
+ args.learning_rate, epsilon=args.epsilon))
+ else:
+ raise ValueError('bad --model_type value')
+
+ return estimator
+
+
+def preprocess_input(features, target, train_config, preprocess_output_dir,
+ model_type):
+ """Perform some transformations after reading in the input tensors.
+
+ Args:
+ features: dict of feature_name to tensor
+ target: tensor
+ train_config: our training config object
+ preprocess_output_dir: folder should contain the vocab files.
+ model_type: the tf model type.
+
+ Raises:
+ ValueError: if wrong transforms are used
+
+ Returns:
+ New features dict and new target tensor.
+ """
+
+ target_name = train_config['target_column']
+ key_name = train_config['key_column']
+
+ # Do the numerical transforms.
+ # Numerical transforms supported for regression/classification
+ # 1) num -> do nothing (identity, default)
+ # 2) num -> scale to -1, 1 (scale)
+ # 3) num -> scale to -a, a (scale with value parameter)
+ with tf.name_scope('numerical_feature_preprocess') as scope:
+ if train_config['numerical_columns']:
+ numerical_analysis_file = os.path.join(preprocess_output_dir,
+ NUMERICAL_ANALYSIS)
+ if not file_io.file_exists(numerical_analysis_file):
+ raise ValueError('File %s not found in %s' %
+ (NUMERICAL_ANALYSIS, preprocess_output_dir))
+
+ numerical_anlysis = json.loads(
+ file_io.read_file_to_string(numerical_analysis_file))
+
+ for name in train_config['numerical_columns']:
+ if name == target_name or name == key_name:
+ continue
+
+ transform_config = train_config['transforms'].get(name, {})
+ transform_name = transform_config.get('transform', None)
+ if transform_name == 'scale':
+ value = float(transform_config.get('value', 1.0))
+ features[name] = _scale_tensor(
+ features[name],
+ range_min=numerical_anlysis[name]['min'],
+ range_max=numerical_anlysis[name]['max'],
+ scale_min=-value,
+ scale_max=value)
+ elif transform_name == 'identity' or transform_name is None:
+ pass
+ else:
+ raise ValueError(('For numerical variables, only scale '
+ 'and identity are supported: '
+ 'Error for %s') % name)
+
+ # Do target transform if it exists.
+ if target is not None:
+ with tf.name_scope('target_feature_preprocess') as scope:
+ if target_name in train_config['categorical_columns']:
+ labels = train_config['vocab_stats'][target_name]['labels']
+ table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
+ target = table.lookup(target)
+ #target = tf.contrib.lookup.string_to_index(target, labels)
+
+ # Do categorical transforms. Only apply vocab mapping. The real
+ # transforms are done with tf learn column features.
+ with tf.name_scope('categorical_feature_preprocess') as scope:
+ for name in train_config['categorical_columns']:
+ if name == key_name or name == target_name:
+ continue
+ transform_config = train_config['transforms'].get(name, {})
+ transform_name = transform_config.get('transform', None)
+
+ # Supported transforms:
+ # for DNN
+ # 1) string -> hash -> embedding (hash_embedding)
+ # 2) string -> make int -> embedding (embedding)
+ # 3) string -> hash -> one_hot (hash_one_hot)
+ # 4) string -> make int -> one_hot (one_hot, default)
+ # for linear
+ # 1) string -> make int -> sparse_column_with_integerized_feature (sparse, default)
+ # 2) string -> sparse_column_with_hash_bucket (hash_sparse)
+ if is_dnn_model(model_type):
+ if (transform_name == 'hash_embedding' or
+ transform_name == 'hash_one_hot'):
+ map_vocab = False
+ elif (transform_name == 'embedding' or
+ transform_name == 'one_hot' or
+ transform_name is None):
+ map_vocab = True
+ else:
+ raise ValueError('For DNN models, only hash_embedding, '
+ 'hash_one_hot, embedding, and one_hot transforms '
+ 'are supported.')
+ elif is_linear_model(model_type):
+ if (transform_name == 'sparse' or
+ transform_name is None):
+ map_vocab = True
+ elif transform_name == 'hash_sparse':
+ map_vocab = False
+ else:
+ raise ValueError('For linear models, only sparse and '
+ 'hash_sparse are supported.')
+ if map_vocab:
+ labels = train_config['vocab_stats'][name]['labels']
+ table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
+ features[name] = table.lookup(features[name])
+
+ return features, target
+
+
+def _scale_tensor(tensor, range_min, range_max, scale_min, scale_max):
+ """Scale a tensor to scale_min to scale_max.
+
+ Args:
+ tensor: input tensor. Should be a numerical tensor.
+ range_min: min expected value for this feature/tensor.
+ range_max: max expected Value.
+ scale_min: new expected min value.
+ scale_max: new expected max value.
+
+ Returns:
+ scaled tensor.
+ """
+ if range_min == range_max:
+ return tensor
+
+ float_tensor = tf.to_float(tensor)
+ scaled_tensor = tf.divide(
+ (tf.subtract(float_tensor, range_min)
+ * tf.constant(float(scale_max - scale_min))),
+ tf.constant(float(range_max - range_min)))
+ shifted_tensor = scaled_tensor + tf.constant(float(scale_min))
+
+ return shifted_tensor
+
+
+def _tflearn_features(train_config, args):
+ """Builds the tf.learn feature list.
+
+ All numerical features are just given real_valued_column because all the
+ preprocessing transformations are done in preprocess_input. Categorical
+ features are processed here depending on whether the vocab map (from string
+ to int) was applied in preprocess_input.
+
+ Args:
+ train_config: our train config object
+ args: command line args.
+
+ Returns:
+ List of TF learn feature columns.
+
+ Raises:
+ ValueError: if wrong transforms are used for the model type.
+ """
+ feature_columns = []
+ target_name = train_config['target_column']
+ key_name = train_config['key_column']
+
+ for name in train_config['numerical_columns']:
+ if name != target_name and name != key_name:
+ feature_columns.append(tf.contrib.layers.real_valued_column(
+ name,
+ dimension=1))
+
+ for name in train_config['categorical_columns']:
+ if name != target_name and name != key_name:
+ transform_config = train_config['transforms'].get(name, {})
+ transform_name = transform_config.get('transform', None)
+
+ if is_dnn_model(args.model_type):
+ if transform_name == 'hash_embedding':
+ sparse = tf.contrib.layers.sparse_column_with_hash_bucket(
+ name,
+ hash_bucket_size=transform_config['hash_bucket_size'])
+ learn_feature = tf.contrib.layers.embedding_column(
+ sparse,
+ dimension=transform_config['embedding_dim'])
+ elif transform_name == 'hash_one_hot':
+ sparse = tf.contrib.layers.sparse_column_with_hash_bucket(
+ name,
+ hash_bucket_size=transform_config['hash_bucket_size'])
+ learn_feature = tf.contrib.layers.embedding_column(
+ sparse,
+ dimension=train_config['vocab_stats'][name]['n_classes'])
+ elif transform_name == 'embedding':
+ sparse = tf.contrib.layers.sparse_column_with_integerized_feature(
+ name,
+ bucket_size=train_config['vocab_stats'][name]['n_classes'])
+ learn_feature = tf.contrib.layers.embedding_column(
+ sparse,
+ dimension=transform_config['embedding_dim'])
+ elif transform_name == 'one_hot' or transform_name is None:
+ sparse = tf.contrib.layers.sparse_column_with_integerized_feature(
+ name,
+ bucket_size=train_config['vocab_stats'][name]['n_classes'])
+ learn_feature = tf.contrib.layers.one_hot_column(sparse)
+ else:
+ raise ValueError('For DNN models, only hash_embedding, '
+ 'hash_one_hot, embedding, and one_hot transforms '
+ 'are supported.')
+ elif is_linear_model(args.model_type):
+ if transform_name == 'sparse' or transform_name is None:
+ learn_feature = tf.contrib.layers.sparse_column_with_integerized_feature(
+ name,
+ bucket_size=train_config['vocab_stats'][name]['n_classes'])
+ elif transform_name == 'hash_sparse':
+ learn_feature = tf.contrib.layers.sparse_column_with_hash_bucket(
+ name,
+ hash_bucket_size=transform_config['hash_bucket_size'])
+ else:
+ raise ValueError('For linear models, only sparse and '
+ 'hash_sparse are supported.')
+
+ # Save the feature
+ feature_columns.append(learn_feature)
+ return feature_columns
+
+
+# ==============================================================================
+# Building the TF learn estimators
+# ==============================================================================
+
+
+def get_vocabulary(preprocess_output_dir, name):
+ """Loads the vocabulary file as a list of strings.
+
+ Args:
+ preprocess_output_dir: Should contain the file CATEGORICAL_ANALYSIS % name.
+ name: name of the csv column.
+
+ Returns:
+ List of strings.
+
+ Raises:
+ ValueError: if file is missing.
+ """
+ vocab_file = os.path.join(preprocess_output_dir, CATEGORICAL_ANALYSIS % name)
+ if not file_io.file_exists(vocab_file):
+ raise ValueError('File %s not found in %s' %
+ (CATEGORICAL_ANALYSIS % name, preprocess_output_dir))
+
+ labels = file_io.read_file_to_string(vocab_file).split('\n')
+ label_values = [x for x in labels if x] # remove empty lines
+
+ return label_values
+
+
+def merge_metadata(preprocess_output_dir, transforms_file):
+ """Merge schema, analysis, and transforms files into one python object.
+
+ Args:
+ preprocess_output_dir: the output folder of preprocessing. Should contain
+ the schema, and the numerical and categorical
+ analysis files.
+ transforms_file: the training transforms file.
+
+ Returns:
+ A dict in the form
+ {
+ csv_header: [name1, name2, ...],
+ csv_defaults: {name1: value, name2: value},
+ key_column: name,
+ target_column: name,
+ categorical_columns: []
+ numerical_columns: []
+ transforms: { name1: {transform: scale, value: 2},
+ name2: {transform: embedding, dim: 50}, ...
+ }
+ vocab_stats: { name3: {n_classes: 23, labels: ['1', '2', ..., '23']},
+ name4: {n_classes: 102, labels: ['red', 'blue', ...]}}
+ }
+
+ Raises:
+ ValueError: if one of the input metadata files is wrong.
+ """
+ numerical_anlysis_file = os.path.join(preprocess_output_dir,
+ NUMERICAL_ANALYSIS)
+ schema_file = os.path.join(preprocess_output_dir, SCHEMA_FILE)
+
+ numerical_anlysis = json.loads(file_io.read_file_to_string(
+ numerical_anlysis_file))
+ schema = json.loads(file_io.read_file_to_string(schema_file))
+ transforms = json.loads(file_io.read_file_to_string(transforms_file))
+
+ result_dict = {}
+ result_dict['csv_header'] = [col_schema['name'] for col_schema in schema]
+ result_dict['key_column'] = None
+ result_dict['target_column'] = None
+ result_dict['categorical_columns'] = []
+ result_dict['numerical_columns'] = []
+ result_dict['transforms'] = {}
+ result_dict['csv_defaults'] = {}
+ result_dict['vocab_stats'] = {}
+
+ # get key column.
+ for name, trans_config in transforms.iteritems():
+ if trans_config.get('transform', None) == 'key':
+ result_dict['key_column'] = name
+ break
+ if result_dict['key_column'] is None:
+ raise ValueError('Key transform missing from transforms file.')
+
+ # get target column.
+ result_dict['target_column'] = schema[0]['name']
+ for name, trans_config in transforms.iteritems():
+ if trans_config.get('transform', None) == 'target':
+ result_dict['target_column'] = name
+ break
+ if result_dict['target_column'] is None:
+ raise ValueError('Target transform missing from transforms file.')
+
+ # Get the numerical/categorical columns.
+ for col_schema in schema:
+ col_name = col_schema['name']
+ col_type = col_schema['type'].lower()
+ if col_name == result_dict['key_column']:
+ continue
+
+ if col_type == 'string':
+ result_dict['categorical_columns'].append(col_name)
+ elif col_type == 'integer' or col_type == 'float':
+ result_dict['numerical_columns'].append(col_name)
+ else:
+ raise ValueError('Unsupported schema type %s' % col_type)
+
+ # Get the transforms.
+ for name, trans_config in transforms.iteritems():
+ if name != result_dict['target_column'] and name != result_dict['key_column']:
+ result_dict['transforms'][name] = trans_config
+
+ # Get the vocab_stats
+ for name in result_dict['categorical_columns']:
+ if name == result_dict['key_column']:
+ continue
+
+ label_values = get_vocabulary(preprocess_output_dir, name)
+ if name != result_dict['target_column'] and '' not in label_values:
+ label_values.append('') # append a 'missing' label.
+ n_classes = len(label_values)
+ result_dict['vocab_stats'][name] = {'n_classes': n_classes,
+ 'labels': label_values}
+
+ # Get the csv_defaults
+ for col_schema in schema:
+ name = col_schema['name']
+ col_type = col_schema['type'].lower()
+ default = transforms.get(name, {}).get('default', None)
+
+ if name == result_dict['target_column']:
+ if name in result_dict['numerical_columns']:
+ default = float(default or 0.0)
+ else:
+ default = default or ''
+ elif name == result_dict['key_column']:
+ if col_type == 'string':
+ default = str(default or '')
+ elif col_type == 'float':
+ default = float(default or 0.0)
+ else:
+ default = int(default or 0)
+ else:
+ if col_type == 'string':
+ default = str(default or '')
+ if default not in result_dict['vocab_stats'][name]['labels']:
+ raise ValueError('Default %s is not in the vocab for %s' %
+ (default, name))
+ else:
+ default = float(default or numerical_anlysis[name]['mean'])
+
+ result_dict['csv_defaults'][name] = default
+
+ validate_metadata(result_dict)
+ return result_dict
+
+
+def validate_metadata(train_config):
+ """Perform some checks that the trainig config is correct.
+
+ Args:
+ train_config: train config as produced by merge_metadata()
+
+ Raises:
+ ValueError: if columns look wrong.
+ """
+
+ # Make sure we have a default for every column
+ if len(train_config['csv_header']) != len(train_config['csv_defaults']):
+ raise ValueError('Unequal number of columns in input features file and '
+ 'schema file.')
+
+ # Check there are no missing columns. sorted_columns has two copies of the
+ # target column because the target column is also listed in
+ # categorical_columns or numerical_columns.
+ sorted_columns = sorted(train_config['csv_header']
+ + [train_config['target_column']])
+
+ sorted_columns2 = sorted(train_config['categorical_columns']
+ + train_config['numerical_columns']
+ + [train_config['key_column']]
+ + [train_config['target_column']])
+ if sorted_columns2 != sorted_columns:
+ raise ValueError('Each csv header must be a numerical/categorical type, a '
+ 'key, or a target.')
+
+
+def is_linear_model(model_type):
+ return model_type.startswith('linear_')
+
+
+def is_dnn_model(model_type):
+ return model_type.startswith('dnn_')
+
+
+def is_regression_model(model_type):
+ return model_type.endswith('_regression')
+
+
+def is_classification_model(model_type):
+ return model_type.endswith('_classification')
diff --git a/solutionbox/structured_data/setup.py b/solutionbox/structured_data/setup.py
new file mode 100644
index 000000000..aee640276
--- /dev/null
+++ b/solutionbox/structured_data/setup.py
@@ -0,0 +1,73 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# A copy of this file must be made in datalab_solutions/structured_data/setup.py
+
+import datetime
+import os
+import re
+from setuptools import setup
+
+
+
+# The version is saved in an __init__ file.
+def get_version():
+ VERSIONFILE = os.path.join('datalab_solutions/structured_data/',
+ '__init__.py')
+ if not os.path.isfile(VERSIONFILE):
+ raise ValueError('setup.py: File not found %s' % VERSIONFILE)
+ initfile_lines = open(VERSIONFILE, 'rt').readlines()
+ VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]"
+ for line in initfile_lines:
+ mo = re.search(VSRE, line, re.M)
+ if mo:
+ return mo.group(1)
+ raise RuntimeError('Unable to find version string in %s.' % (VERSIONFILE,))
+
+
+setup(
+ name='structured_data',
+ version=get_version(),
+ packages=[
+ 'datalab_solutions',
+ 'datalab_solutions.structured_data',
+ 'datalab_solutions.structured_data.trainer',
+ 'datalab_solutions.structured_data.preprocess',
+ 'datalab_solutions.structured_data.predict',
+ ],
+ description='Google Cloud Datalab Structured Data Package',
+ author='Google',
+ author_email='google-cloud-datalab-feedback@googlegroups.com',
+ keywords=[
+ ],
+ license="Apache Software License",
+ classifiers=[
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2",
+ "Development Status :: 4 - Beta",
+ "Environment :: Other Environment",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Topic :: Software Development :: Libraries :: Python Modules"
+ ],
+ long_description="""
+ """,
+ install_requires=[
+ 'tensorflow==1.0',
+ 'protobuf==3.1.0',
+ 'google-cloud-dataflow==0.5.5'
+ ],
+ package_data={
+ },
+ data_files=[],
+)