diff --git a/datalab/mlalpha/__init__.py b/datalab/ml/__init__.py similarity index 61% rename from datalab/mlalpha/__init__.py rename to datalab/ml/__init__.py index 9dc5b2b88..99bc309a7 100644 --- a/datalab/mlalpha/__init__.py +++ b/datalab/ml/__init__.py @@ -14,20 +14,14 @@ from __future__ import absolute_import -from ._local_runner import LocalRunner -from ._cloud_runner import CloudRunner -from ._metadata import Metadata -from ._local_predictor import LocalPredictor -from ._cloud_predictor import CloudPredictor -from ._job import Jobs +from ._job import Jobs, Job from ._summary import Summary -from ._tensorboard import TensorBoardManager -from ._dataset import DataSet -from ._package import Packager -from ._cloud_models import CloudModels, CloudModelVersions +from ._tensorboard import TensorBoard +from ._dataset import CsvDataSet, BigQueryDataSet +from ._cloud_models import Models, ModelVersions from ._confusion_matrix import ConfusionMatrix +from ._feature_slice_view import FeatureSliceView +from ._cloud_training_config import CloudTrainingConfig +from ._util import * -from plotly.offline import init_notebook_mode - -init_notebook_mode() diff --git a/datalab/ml/_cloud_models.py b/datalab/ml/_cloud_models.py new file mode 100644 index 000000000..5e5098e0a --- /dev/null +++ b/datalab/ml/_cloud_models.py @@ -0,0 +1,274 @@ +# Copyright 2016 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +"""Implements Cloud ML Model Operations""" + +from googleapiclient import discovery +import os +import yaml + +import datalab.context +import datalab.storage +import datalab.utils + +from . import _util + +class Models(object): + """Represents a list of Cloud ML models for a project.""" + + def __init__(self, project_id=None): + """ + Args: + project_id: project_id of the models. If not provided, default project_id will be used. + """ + if project_id is None: + project_id = datalab.context.Context.default().project_id + self._project_id = project_id + self._credentials = datalab.context.Context.default().credentials + self._api = discovery.build('ml', 'v1', credentials=self._credentials) + + def _retrieve_models(self, page_token, page_size): + list_info = self._api.projects().models().list( + parent='projects/' + self._project_id, pageToken=page_token, pageSize=page_size).execute() + models = list_info.get('models', []) + page_token = list_info.get('nextPageToken', None) + return models, page_token + + def get_iterator(self): + """Get iterator of models so it can be used as "for model in Models().get_iterator()". + """ + return iter(datalab.utils.Iterator(self._retrieve_models)) + + def get_model_details(self, model_name): + """Get details of the specified model from CloudML Service. + + Args: + model_name: the name of the model. It can be a model full name + ("projects/[project_id]/models/[model_name]") or just [model_name]. + Returns: a dictionary of the model details. 
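# --- Illustrative usage (editor's sketch, not part of this patch) ---
# A minimal example of the Models wrapper above; "iris" is a placeholder model
# name, and Models() falls back to the default Datalab project when no
# project_id is passed.
import datalab.ml as ml

models = ml.Models()
for model in models.get_iterator():   # iterate over all models in the project
  print(model['name'])
models.describe('iris')               # prints the model details as YAML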
+ """ + full_name = model_name + if not model_name.startswith('projects/'): + full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) + return self._api.projects().models().get(name=full_name).execute() + + def create(self, model_name): + """Create a model. + + Args: + model_name: the short name of the model, such as "iris". + Returns: + If successful, returns informaiton of the model, such as + {u'regions': [u'us-central1'], u'name': u'projects/myproject/models/mymodel'} + Raises: + If the model creation failed. + """ + body = {'name': model_name} + parent = 'projects/' + self._project_id + # Model creation is instant. If anything goes wrong, Exception will be thrown. + return self._api.projects().models().create(body=body, parent=parent).execute() + + def delete(self, model_name): + """Delete a model. + + Args: + model_name: the name of the model. It can be a model full name + ("projects/[project_id]/models/[model_name]") or just [model_name]. + """ + full_name = model_name + if not model_name.startswith('projects/'): + full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) + response = self._api.projects().models().delete(name=full_name).execute() + if 'name' not in response: + raise Exception('Invalid response from service. "name" is not found.') + _util.wait_for_long_running_operation(response['name']) + + def list(self, count=10): + """List models under the current project in a table view. + + Args: + count: upper limit of the number of models to list. + Raises: + Exception if it is called in a non-IPython environment. + """ + import IPython + data = [] + # Add range(count) to loop so it will stop either it reaches count, or iteration + # on self is exhausted. "self" is iterable (see __iter__() method). + for _, model in zip(range(count), self): + element = {'name': model['name']} + if 'defaultVersion' in model: + version_short_name = model['defaultVersion']['name'].split('/')[-1] + element['defaultVersion'] = version_short_name + data.append(element) + + IPython.display.display( + datalab.utils.commands.render_dictionary(data, ['name', 'defaultVersion'])) + + def describe(self, model_name): + """Print information of a specified model. + + Args: + model_name: the name of the model to print details on. + """ + model_yaml = yaml.safe_dump(self.get_model_details(model_name), default_flow_style=False) + print model_yaml + + +class ModelVersions(object): + """Represents a list of versions for a Cloud ML model.""" + + def __init__(self, model_name, project_id=None): + """ + Args: + model_name: the name of the model. It can be a model full name + ("projects/[project_id]/models/[model_name]") or just [model_name]. + project_id: project_id of the models. If not provided and model_name is not a full name + (not including project_id), default project_id will be used. 
+ """ + if project_id is None: + self._project_id = datalab.context.Context.default().project_id + self._credentials = datalab.context.Context.default().credentials + self._api = discovery.build('ml', 'v1', credentials=self._credentials) + if not model_name.startswith('projects/'): + model_name = ('projects/%s/models/%s' % (self._project_id, model_name)) + self._full_model_name = model_name + self._model_name = self._full_model_name.split('/')[-1] + + def _retrieve_versions(self, page_token, page_size): + parent = self._full_model_name + list_info = self._api.projects().models().versions().list(parent=parent, + pageToken=page_token, pageSize=page_size).execute() + versions = list_info.get('versions', []) + page_token = list_info.get('nextPageToken', None) + return versions, page_token + + def get_iterator(self): + """Get iterator of versions so it can be used as + "for v in ModelVersions(model_name).get_iterator()". + """ + return iter(datalab.utils.Iterator(self._retrieve_versions)) + + def get_version_details(self, version_name): + """Get details of a version. + + Args: + version: the name of the version in short form, such as "v1". + Returns: a dictionary containing the version details. + """ + name = ('%s/versions/%s' % (self._full_model_name, version_name)) + return self._api.projects().models().versions().get(name=name).execute() + + def deploy(self, version_name, path): + """Deploy a model version to the cloud. + + Args: + version_name: the name of the version in short form, such as "v1". + path: the Google Cloud Storage path (gs://...) which contains the model files. + + Raises: Exception if the path is invalid or does not contain expected files. + Exception if the service returns invalid response. + """ + if not path.startswith('gs://'): + raise Exception('Invalid path. Only Google Cloud Storage path (gs://...) is accepted.') + + # If there is no "export.meta" or"saved_model.pb" under path but there is + # path/model/export.meta or path/model/saved_model.pb, then append /model to the path. + if (not datalab.storage.Item.from_url(os.path.join(path, 'export.meta')).exists() and + not datalab.storage.Item.from_url(os.path.join(path, 'saved_model.pb')).exists()): + if (datalab.storage.Item.from_url(os.path.join(path, 'model', 'export.meta')).exists() or + datalab.storage.Item.from_url(os.path.join(path, 'model', 'saved_model.pb')).exists()): + path = os.path.join(path, 'model') + else: + print('Cannot find export.meta or saved_model.pb, but continue with deployment anyway.') + + body = {'name': self._model_name} + parent = 'projects/' + self._project_id + try: + self._api.projects().models().create(body=body, parent=parent).execute() + except: + # Trying to create an already existing model gets an error. Ignore it. + pass + body = { + 'name': version_name, + 'deployment_uri': path, + 'runtime_version': '1.0', + } + response = self._api.projects().models().versions().create(body=body, + parent=self._full_model_name).execute() + if 'name' not in response: + raise Exception('Invalid response from service. "name" is not found.') + _util.wait_for_long_running_operation(response['name']) + + def delete(self, version_name): + """Delete a version of model. + + Args: + version_name: the name of the version in short form, such as "v1". + """ + name = ('%s/versions/%s' % (self._full_model_name, version_name)) + response = self._api.projects().models().versions().delete(name=name).execute() + if 'name' not in response: + raise Exception('Invalid response from service. 
"name" is not found.') + _util.wait_for_long_running_operation(response['name']) + + def predict(self, version_name, data): + """Get prediction results from features instances. + + Args: + version_name: the name of the version used for prediction. + data: typically a list of instance to be submitted for prediction. The format of the + instance depends on the model. For example, structured data model may require + a csv line for each instance. + Note that online prediction only works on models that take one placeholder value, + such as a string encoding a csv line. + Returns: + A list of prediction results for given instances. Each element is a dictionary representing + output mapping from the graph. + An example: + [{"predictions": 1, "score": [0.00078, 0.71406, 0.28515]}, + {"predictions": 1, "score": [0.00244, 0.99634, 0.00121]}] + """ + full_version_name = ('%s/versions/%s' % (self._full_model_name, version_name)) + request = self._api.projects().predict(body={'instances': data}, + name=full_version_name) + request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' + result = request.execute() + if 'predictions' not in result: + raise Exception('Invalid response from service. Cannot find "predictions" in response.') + + return result['predictions'] + + def describe(self, version_name): + """Print information of a specified model. + + Args: + version: the name of the version in short form, such as "v1". + """ + version_yaml = yaml.safe_dump(self.get_version_details(version_name), + default_flow_style=False) + print version_yaml + + def list(self): + """List versions under the current model in a table view. + + Raises: + Exception if it is called in a non-IPython environment. + """ + import IPython + + # "self" is iterable (see __iter__() method). + data = [{'name': version['name'].split()[-1], + 'deploymentUri': version['deploymentUri'], 'createTime': version['createTime']} + for version in self] + IPython.display.display( + datalab.utils.commands.render_dictionary(data, ['name', 'deploymentUri', 'createTime'])) diff --git a/datalab/ml/_cloud_training_config.py b/datalab/ml/_cloud_training_config.py new file mode 100644 index 000000000..9fcfddb89 --- /dev/null +++ b/datalab/ml/_cloud_training_config.py @@ -0,0 +1,47 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +_CloudTrainingConfig = namedtuple("CloudConfig", + ['region', 'scale_tier', 'master_type', 'worker_type', + 'parameter_server_type', 'worker_count', 'parameter_server_count']) +_CloudTrainingConfig.__new__.__defaults__ = ('BASIC', None, None, None, None, None) + + +class CloudTrainingConfig(_CloudTrainingConfig): + """A config namedtuple containing cloud specific configurations for CloudML training. + + Fields: + region: the region of the training job to be submitted. For example, "us-central1". + Run "gcloud compute regions list" to get a list of regions. 
+ scale_tier: Specifies the machine types, the number of replicas for workers and + parameter servers. For example, "STANDARD_1". See + https://cloud.google.com/ml/reference/rest/v1beta1/projects.jobs#scaletier + for list of accepted values. + master_type: specifies the type of virtual machine to use for your training + job's master worker. Must set this value when scale_tier is set to CUSTOM. + See the link in "scale_tier". + worker_type: specifies the type of virtual machine to use for your training + job's worker nodes. Must set this value when scale_tier is set to CUSTOM. + parameter_server_type: specifies the type of virtual machine to use for your training + job's parameter server. Must set this value when scale_tier is set to CUSTOM. + worker_count: the number of worker replicas to use for the training job. Each + replica in the cluster will be of the type specified in "worker_type". + Must set this value when scale_tier is set to CUSTOM. + parameter_server_count: the number of parameter server replicas to use. Each + replica in the cluster will be of the type specified in "parameter_server_type". + Must set this value when scale_tier is set to CUSTOM. + """ + pass diff --git a/datalab/ml/_confusion_matrix.py b/datalab/ml/_confusion_matrix.py new file mode 100644 index 000000000..c7043943d --- /dev/null +++ b/datalab/ml/_confusion_matrix.py @@ -0,0 +1,111 @@ +# Copyright 2016 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + + +import numpy as np +import json +import matplotlib.pyplot as plt +import pandas as pd +from sklearn.metrics import confusion_matrix + +import datalab.bigquery as bq +import datalab.data as data + +from . import _util + + +class ConfusionMatrix(object): + """Represents a confusion matrix.""" + + def __init__(self, cm, labels): + """ + Args: + cm: a 2-dimensional matrix with row index being target, column index being predicted, + and values being count. + labels: the labels whose order matches the row/column indexes. + """ + self._cm = cm + self._labels = labels + + @staticmethod + def from_csv(input_csv, headers=None, schema_file=None): + """Create a ConfusionMatrix from a csv file. + Args: + input_csv: Path to a Csv file (with no header). Can be local or GCS path. + headers: Csv headers. If present, it must include 'target' and 'predicted'. + schema_file: Path to a JSON file containing BigQuery schema. Used if "headers" is None. + If present, it must include 'target' and 'predicted' columns. + Returns: + A ConfusionMatrix that can be plotted. + Raises: + ValueError if both headers and schema_file are None, or it does not include 'target' + or 'predicted' columns. 
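# --- Illustrative usage (editor's sketch, not part of this patch) ---
# Building a confusion matrix from a prediction CSV with the from_csv factory
# above; the file path and header names are placeholders.
import datalab.ml as ml

cm = ml.ConfusionMatrix.from_csv(
    'gs://my-bucket/eval/predictions.csv',
    headers=['target', 'predicted', 'score'])
cm.plot()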
+ """ + + if headers is not None: + names = headers + elif schema_file is not None: + with _util.open_local_or_gcs(schema_file, mode='r') as f: + schema = json.load(f) + names = [x['name'] for x in schema] + else: + raise ValueError('Either headers or schema_file is needed') + with _util.open_local_or_gcs(input_csv, mode='r') as f: + df = pd.read_csv(f, names=names) + if 'target' not in df or 'predicted' not in df: + raise ValueError('Cannot find "target" or "predicted" column') + + labels = sorted(set(df['target']) | set(df['predicted'])) + cm = confusion_matrix(df['target'], df['predicted'], labels=labels) + return ConfusionMatrix(cm, labels) + + @staticmethod + def from_bigquery(sql): + """Create a ConfusionMatrix from a BigQuery table or query. + Args: + sql: Can be one of: + A SQL query string. + A SQL Query module defined with '%%sql --name [module_name]'. + A Bigquery table. + The query results or table must include "target", "predicted" columns. + Returns: + A ConfusionMatrix that can be plotted. + Raises: + ValueError if query results or table does not include 'target' or 'predicted' columns. + """ + + query, _ = data.SqlModule.get_sql_statement_with_environment(sql, {}) + sql = ('SELECT target, predicted, count(*) as count FROM (%s) group by target, predicted' + % query.sql) + df = bq.Query(sql).results().to_dataframe() + labels = sorted(set(df['target']) | set(df['predicted'])) + labels_count = len(labels) + df['target'] = [labels.index(x) for x in df['target']] + df['predicted'] = [labels.index(x) for x in df['predicted']] + cm = [[0]*labels_count for i in range(labels_count)] + for index, row in df.iterrows(): + cm[row['target']][row['predicted']] = row['count'] + return ConfusionMatrix(cm, labels) + + def plot(self): + """Plot the confusion matrix.""" + + plt.imshow(self._cm, interpolation='nearest', cmap=plt.cm.Blues) + plt.title('Confusion matrix') + plt.colorbar() + tick_marks = np.arange(len(self._labels)) + plt.xticks(tick_marks, self._labels, rotation=45) + plt.yticks(tick_marks, self._labels) + plt.tight_layout() + plt.ylabel('True label') + plt.xlabel('Predicted label') + diff --git a/datalab/ml/_dataset.py b/datalab/ml/_dataset.py new file mode 100644 index 000000000..434382b50 --- /dev/null +++ b/datalab/ml/_dataset.py @@ -0,0 +1,187 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + + +"""Implements DataSets that serve two purposes: + +1. Recommended way to pass data source to ML packages. +2. All DataSets can be sampled into dataframe for analysis/visualization. +""" + +import json +import numpy as np +import pandas as pd +import random + +import datalab.bigquery as bq +import datalab.data + +from . import _util + + +class CsvDataSet(object): + """DataSet based on CSV files and schema.""" + + def __init__(self, file_pattern, schema=None, schema_file=None): + """ + Args: + file_pattern: A list of CSV files. or a string. Can contain wildcards in + file names. Can be local or GCS path. 
+ schema: A BigQuery schema object in the form of + [{'name': 'col1', 'type': 'STRING'}, + {'name': 'col2', 'type': 'INTEGER'}] + or a single string in of the form 'col1:STRING,col2:INTEGER,col3:FLOAT'. + schema_file: A JSON serialized schema file. If schema is None, it will try to load from + schema_file if not None. + Raise: + ValueError if both schema and schema_file are None. + """ + if schema is None and schema_file is None: + raise ValueError('schema and schema_file cannot both be None.') + + if schema is not None: + if isinstance(schema, list): + self._schema = schema + else: + self._schema = [] + for x in schema.split(','): + parts = x.split(':') + if len(parts) != 2: + raise ValueError('invalid schema string "%s"' % x) + self._schema.append({'name': parts[0].strip(), 'type': parts[1].strip()}) + else: + with _util.open_local_or_gcs(schema_file, 'r') as f: + self._schema = json.load(f) + + if isinstance(file_pattern, basestring): + file_pattern = [file_pattern] + self._input_files = file_pattern + + self._glob_files = [] + + + @property + def input_files(self): + """Returns the file list that was given to this class without globing files.""" + return self._input_files + + @property + def files(self): + if not self._glob_files: + for file in self._input_files: + # glob_files() returns unicode strings which doesn't make DataFlow happy. So str(). + self._glob_files += [str(x) for x in _util.glob_files(file)] + + return self._glob_files + + @property + def schema(self): + return self._schema + + def sample(self, n): + """ Samples data into a Pandas DataFrame. + Args: + n: number of sampled counts. + Returns: + A dataframe containing sampled data. + Raises: + Exception if n is larger than number of rows. + """ + row_total_count = 0 + row_counts = [] + for file in self.files: + with _util.open_local_or_gcs(file, 'r') as f: + num_lines = sum(1 for line in f) + row_total_count += num_lines + row_counts.append(num_lines) + + names = None + dtype = None + if self._schema: + _MAPPINGS = { + 'FLOAT': np.float64, + 'INTEGER': np.int64, + 'TIMESTAMP': np.datetime64, + 'BOOLEAN': np.bool, + } + names = [x['name'] for x in self._schema] + dtype = {x['name']: _MAPPINGS.get(x['type'], object) for x in self._schema} + + skip_count = row_total_count - n + # Get all skipped indexes. These will be distributed into each file. + # Note that random.sample will raise Exception if skip_count is greater than rows count. + skip_all = sorted(random.sample(xrange(0, row_total_count), skip_count)) + dfs = [] + for file, row_count in zip(self.files, row_counts): + skip = [x for x in skip_all if x < row_count] + skip_all = [x - row_count for x in skip_all if x >= row_count] + with _util.open_local_or_gcs(file, 'r') as f: + dfs.append(pd.read_csv(f, skiprows=skip, names=names, dtype=dtype, header=None)) + return pd.concat(dfs, axis=0, ignore_index=True) + + +class BigQueryDataSet(object): + """DataSet based on BigQuery table or query.""" + + def __init__(self, sql=None, table=None): + """ + Args: + sql: A SQL query string, or a SQL Query module defined with '%%sql --name [module_name]' + table: A table name in the form of "dataset:table". + Raises: + ValueError if both sql and table are set, or both are None. 
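# --- Illustrative usage (editor's sketch, not part of this patch) ---
# Sampling from the two DataSet flavors defined in this file; the file pattern,
# schema and table name are placeholders.
import datalab.ml as ml

csv_data = ml.CsvDataSet('gs://my-bucket/data/train-*.csv',
                         schema='sepal_length:FLOAT,species:STRING')
csv_sample = csv_data.sample(100)      # pandas DataFrame with 100 rows

bq_data = ml.BigQueryDataSet(table='mydataset:mytable')
bq_sample = bq_data.sample(100)        # approximate sample; issues a BigQuery query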
+ """ + if (sql is None and table is None) or (sql is not None and table is not None): + raise ValueError('One and only one of sql and table should be set.') + + self._query = None + self._table = None + if sql is not None: + query, _ = datalab.data.SqlModule.get_sql_statement_with_environment(sql, {}) + self._query = query.sql + if table is not None: + self._table = table + self._schema = None + + @property + def query(self): + return self._query + + @property + def table(self): + return self._table + + @property + def schema(self): + if self._schema is None: + source = self._query or self._table + self._schema = bq.Query('SELECT * FROM (%s) LIMIT 1' % source).results().schema + return self._schema + + def sample(self, n): + """Samples data into a Pandas DataFrame. Note that it calls BigQuery so it will + incur cost. + Args: + n: number of sampled counts. Note that the number of counts returned is approximated. + Returns: + A dataframe containing sampled data. + Raises: + Exception if n is larger than number of rows. + """ + source = self._query or self._table + total = bq.Query('select count(*) from (%s)' % source).results()[0].values()[0] + if n > total: + raise ValueError('sample larger than population') + sampling = bq.Sampling.random(n*100.0/float(total)) + sample = bq.Query(source).sample(sampling=sampling) + df = sample.to_dataframe() + return df diff --git a/datalab/ml/_feature_slice_view.py b/datalab/ml/_feature_slice_view.py new file mode 100644 index 000000000..474b7843e --- /dev/null +++ b/datalab/ml/_feature_slice_view.py @@ -0,0 +1,87 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +import json +import pandas as pd +from types import ModuleType + +import datalab.data +import datalab.utils + + +class FeatureSliceView(object): + """Represents A feature slice view.""" + + def _get_lantern_format(self, df): + """ Feature slice view browser expects data in the format of: + {"metricValues": {"count": 12, "accuracy": 1.0}, "feature": "species:Iris-setosa"} + {"metricValues": {"count": 11, "accuracy": 0.72}, "feature": "species:Iris-versicolor"} + ... + This function converts a DataFrame to such format. + """ + + if ('count' not in df) or ('feature' not in df): + raise Exception('No "count" or "feature" found in data.') + if len(df.columns) < 3: + raise Exception('Need at least one metrics column.') + if len(df) == 0: + raise Exception('Data is empty') + + data = [] + for _, row in df.iterrows(): + metric_values = dict(row) + feature = metric_values.pop('feature') + data.append({'feature': feature, 'metricValues': metric_values}) + return data + + def plot(self, data): + """ Plots a featire slice view on given data. + + Args: + data: Can be one of: + A string of sql query. + A sql query module defined by "%%sql --module module_name". + A pandas DataFrame. + Regardless of data type, it must include the following columns: + "feature": identifies a slice of features. For example: "petal_length:4.0-4.2". 
+ "count": number of instances in that slice of features. + All other columns are viewed as metrics for its feature slice. At least one is required. + """ + import IPython + + if isinstance(data, ModuleType) or isinstance(data, basestring): + item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(data, {}) + query = datalab.bigquery.Query(item) + df = query.results().to_dataframe() + data = self._get_lantern_format(df) + elif isinstance(data, pd.core.frame.DataFrame): + data = self._get_lantern_format(data) + else: + raise Exception('data needs to be a sql query, or a pandas DataFrame.') + + HTML_TEMPLATE = """ + + """ + # Serialize the data and list of metrics names to JSON string. + metrics_str = str(map(str, data[0]['metricValues'].keys())) + data_str = str([{str(k): json.dumps(v) for k,v in elem.iteritems()} for elem in data]) + html_id = 'l' + datalab.utils.commands.Html.next_id() + html = HTML_TEMPLATE.format(html_id=html_id, metrics=metrics_str, data=data_str) + IPython.display.display(IPython.display.HTML(html)) + diff --git a/datalab/ml/_job.py b/datalab/ml/_job.py new file mode 100644 index 000000000..8dea01231 --- /dev/null +++ b/datalab/ml/_job.py @@ -0,0 +1,146 @@ +# Copyright 2016 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +"""Implements Cloud ML Operation wrapper.""" + + +import datalab.utils +import datalab.context +from googleapiclient import discovery +import yaml + + +class Job(object): + """Represents a Cloud ML job.""" + + def __init__(self, name, context=None): + """Initializes an instance of a CloudML Job. + + Args: + name: the name of the job. It can be an operation full name + ("projects/[project_id]/jobs/[operation_name]") or just [operation_name]. + context: an optional Context object providing project_id and credentials. + """ + if context is None: + context = datalab.context.Context.default() + self._context = context + self._api = discovery.build('ml', 'v1', credentials=self._context.credentials) + if not name.startswith('projects/'): + name = 'projects/' + self._context.project_id + '/jobs/' + name + self._name = name + self.refresh() + + @property + def info(self): + return self._info + + def refresh(self): + """ Refresh the job info. """ + self._info = self._api.projects().jobs().get(name=self._name).execute() + + def describe(self): + job_yaml = yaml.safe_dump(self._info, default_flow_style=False) + print job_yaml + + @staticmethod + def submit_training(job_request, job_id=None): + """Submit a training job. + + Args: + job_request: the arguments of the training job in a dict. 
For example, + { + 'package_uris': 'gs://my-bucket/iris/trainer-0.1.tar.gz', + 'python_module': 'trainer.task', + 'scale_tier': 'BASIC', + 'region': 'us-central1', + 'args': { + 'train_data_paths': ['gs://mubucket/data/features_train'], + 'eval_data_paths': ['gs://mubucket/data/features_eval'], + 'metadata_path': 'gs://mubucket/data/metadata.yaml', + 'output_path': 'gs://mubucket/data/mymodel/', + } + } + If 'args' is present in job_request and is a dict, it will be expanded to + --key value or --key list_item_0 --key list_item_1, ... + job_id: id for the training job. If None, an id based on timestamp will be generated. + Returns: + A Job object representing the cloud training job. + """ + new_job_request = dict(job_request) + # convert job_args from dict to list as service required. + if 'args' in job_request and isinstance(job_request['args'], dict): + job_args = job_request['args'] + args = [] + for k,v in job_args.iteritems(): + if isinstance(v, list): + for item in v: + args.append('--' + str(k)) + args.append(str(item)) + else: + args.append('--' + str(k)) + args.append(str(v)) + new_job_request['args'] = args + + if job_id is None: + job_id = datetime.datetime.now().strftime('%y%m%d_%H%M%S') + if 'python_module' in new_job_request: + job_id = new_job_request['python_module'].replace('.', '_') + \ + '_' + job_id + + job = { + 'job_id': job_id, + 'training_input': new_job_request, + } + context = datalab.context.Context.default() + cloudml = discovery.build('ml', 'v1', credentials=context.credentials) + request = cloudml.projects().jobs().create(body=job, + parent='projects/' + context.project_id) + request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' + request.execute() + return Job(job_id) + + +class Jobs(object): + """Represents a list of Cloud ML jobs for a project.""" + + def __init__(self, filter=None): + """Initializes an instance of a CloudML Job list that is iteratable ("for job in jobs()"). + + Args: + filter: filter string for retrieving jobs, such as "state=FAILED" + context: an optional Context object providing project_id and credentials. + api: an optional CloudML API client. + """ + self._filter = filter + self._context = datalab.context.Context.default() + self._api = discovery.build('ml', 'v1', credentials=self._context.credentials) + + def _retrieve_jobs(self, page_token, page_size): + list_info = self._api.projects().jobs().list(parent='projects/' + self._context.project_id, + pageToken=page_token, pageSize=page_size, + filter=self._filter).execute() + jobs = list_info.get('jobs', []) + page_token = list_info.get('nextPageToken', None) + return jobs, page_token + + def get_iterator(self): + """Get iterator of jobs so it can be used as "for model in Jobs().get_iterator()". + """ + return iter(datalab.utils.Iterator(self._retrieve_jobs)) + + def list(self, count=10): + import IPython + data = [{'Id': job['jobId'], 'State': job.get('state', 'UNKNOWN'), + 'createTime': job['createTime']} + for _, job in zip(range(count), self)] + IPython.display.display( + datalab.utils.commands.render_dictionary(data, ['Id', 'State', 'createTime'])) diff --git a/datalab/ml/_summary.py b/datalab/ml/_summary.py new file mode 100644 index 000000000..bf18fc1c5 --- /dev/null +++ b/datalab/ml/_summary.py @@ -0,0 +1,156 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import datetime +import fnmatch +import glob +import matplotlib.pyplot as plt +import os +import pandas as pd +from tensorflow.core.util import event_pb2 +from tensorflow.python.lib.io import tf_record + +from . import _util + + +class Summary(object): + """Represents TensorFlow summary events from files under specified directories.""" + + def __init__(self, paths): + """Initializes an instance of a Summary. + + Args: + path: a list of paths to directories which hold TensorFlow events files. + Can be local path or GCS paths. Wild cards allowed. + """ + self._paths = [paths] if isinstance(paths, basestring) else paths + + def _glob_events_files(self, paths): + event_files = [] + for path in paths: + if path.startswith('gs://'): + event_files += _util.glob_files(os.path.join(path, '*.tfevents.*')) + else: + dirs = _util.glob_files(path) + for dir in dirs: + for root, _, filenames in os.walk(dir): + for filename in fnmatch.filter(filenames, '*.tfevents.*'): + event_files.append(os.path.join(root, filename)) + return event_files + + def list_events(self): + """List all scalar events in the directory. + + Returns: + A dictionary. Key is the name of a event. Value is a set of dirs that contain that event. + """ + event_dir_dict = {} + for event_file in self._glob_events_files(self._paths): + dir = os.path.dirname(event_file) + try: + for record in tf_record.tf_record_iterator(event_file): + event = event_pb2.Event.FromString(record) + if event.summary is None or event.summary.value is None: + continue + for value in event.summary.value: + if value.simple_value is None or value.tag is None: + continue + if not value.tag in event_dir_dict: + event_dir_dict[value.tag] = set() + event_dir_dict[value.tag].add(dir) + except: + # It seems current TF (1.0) has a bug when iterating events from a file near the end. + # For now just catch and pass. + # print('Error in iterating events from file ' + event_file) + continue + return event_dir_dict + + + def get_events(self, event_names): + """Get all events as pandas DataFrames given a list of names. + + Args: + event_names: A list of events to get. + + Returns: + A list with the same length as event_names. Each element is a dictionary + {dir1: DataFrame1, dir2: DataFrame2, ...}. + Multiple directories may contain events with the same name, but they are different + events (i.e. 'loss' under trains_set/, and 'loss' under eval_set/.) 
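# --- Illustrative usage (editor's sketch, not part of this patch) ---
# Reading and plotting TensorFlow summary events with the Summary class above;
# the training output directory is a placeholder.
import datalab.ml as ml

summary = ml.Summary('gs://my-bucket/training/run1')
print(summary.list_events())           # e.g. {'loss': set([...]), 'accuracy': set([...])}
summary.plot(['loss', 'accuracy'], x_axis='time')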
+ """ + event_names = [event_names] if isinstance(event_names, basestring) else event_names + + all_events = self.list_events() + dirs_to_look = set() + for event, dirs in all_events.iteritems(): + if event in event_names: + dirs_to_look.update(dirs) + + ret_events = [dict() for i in range(len(event_names))] + for dir in dirs_to_look: + for event_file in self._glob_events_files([dir]): + try: + for record in tf_record.tf_record_iterator(event_file): + event = event_pb2.Event.FromString(record) + if event.summary is None or event.wall_time is None or event.summary.value is None: + continue + + event_time = datetime.datetime.fromtimestamp(event.wall_time) + for value in event.summary.value: + if value.tag not in event_names or value.simple_value is None: + continue + + index = event_names.index(value.tag) + dir_event_dict = ret_events[index] + if dir not in dir_event_dict: + dir_event_dict[dir] = pd.DataFrame( + [[event_time, event.step, value.simple_value]], + columns=['time', 'step', 'value']) + else: + df = dir_event_dict[dir] + # Append a row. + df.loc[len(df)] = [event_time, event.step, value.simple_value] + except: + # It seems current TF (1.0) has a bug when iterating events from a file near the end. + # For now just catch and pass. + # print('Error in iterating events from file ' + event_file) + continue + + for dir_event_dict in ret_events: + for df in dir_event_dict.values(): + df.sort_values(by=['time'], inplace=True) + + return ret_events + + def plot(self, event_names, x_axis='step'): + """Plots a list of events. Each event (a dir+event_name) is represetented as a line + in the graph. + + Args: + event_names: A list of events to plot. Each event_name may correspond to multiple events, + each in a different directory. + x_axis: whether to use step or time as x axis. + """ + event_names = [event_names] if isinstance(event_names, basestring) else event_names + events_list = self.get_events(event_names) + for event_name, dir_event_dict in zip(event_names, events_list): + for dir, df in dir_event_dict.iteritems(): + label = event_name + ':' + dir + x_column = df['step'] if x_axis == 'step' else df['time'] + plt.plot(x_column, df['value'], label=label) + plt.legend(loc='best') + plt.show() + diff --git a/datalab/mlalpha/_tensorboard.py b/datalab/ml/_tensorboard.py similarity index 78% rename from datalab/mlalpha/_tensorboard.py rename to datalab/ml/_tensorboard.py index 194a194b6..89956b360 100644 --- a/datalab/mlalpha/_tensorboard.py +++ b/datalab/ml/_tensorboard.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All rights reserved. +# Copyright 2017 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -11,6 +11,11 @@ # the License. +try: + import IPython +except ImportError: + raise Exception('This module can only be loaded in ipython.') + import argparse import psutil import subprocess @@ -20,15 +25,13 @@ import datalab.storage -class TensorBoardManager(object): +class TensorBoard(object): """Start, shutdown, and list TensorBoard instances. """ @staticmethod - def get_running_list(): + def list(): """List running TensorBoard instances. 
- - Returns: A list of {'pid': pid, 'logdir': logdir, 'port': port} """ running_list = [] parser = argparse.ArgumentParser() @@ -41,27 +44,16 @@ def get_running_list(): del cmd_args[0:2] # remove 'python' and 'tensorboard' args = parser.parse_args(cmd_args) running_list.append({'pid': p.pid, 'logdir': args.logdir, 'port': args.port}) - return running_list - - @staticmethod - def get_reverse_proxy_url(port): - """Get the reverse proxy url. Note that this URL only works with - Datalab web server which supports reverse proxy. - - Args: - port: the port of the tensorboard instance. - Returns: the reverse proxy URL. - """ - return '/_proxy/%d/' % port + IPython.display.display(datalab.utils.commands.render_dictionary( + running_list, ['pid', 'logdir', 'port'])) + @staticmethod def start(logdir): """Start a TensorBoard instance. Args: logdir: the logdir to run TensorBoard on. - Returns: - A tuple. First is the pid of the instance. Second is the port used. Raises: Exception if the instance cannot be started. """ @@ -77,7 +69,11 @@ def start(logdir): retry = 5 while (retry > 0): if datalab.utils.is_http_running_on(port): - return p.pid, port + url = '/_proxy/%d/' % port + html = '

<p>TensorBoard was started successfully with pid %d. ' % p.pid + html += 'Click <a href="%s" target="_blank">here</a> to access it.</p>
' % url + IPython.display.display_html(html, raw=True) + return time.sleep(1) retry -= 1 diff --git a/datalab/ml/_util.py b/datalab/ml/_util.py new file mode 100644 index 000000000..5db1c8245 --- /dev/null +++ b/datalab/ml/_util.py @@ -0,0 +1,109 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from apache_beam.io import gcsio +import datetime +import glob +from googleapiclient import discovery +import os +import shutil +import subprocess +import tempfile +import time + +import datalab.context + +# TODO: Create an Operation class. +def wait_for_long_running_operation(operation_full_name): + print('Waiting for operation "%s"' % operation_full_name) + api = discovery.build('ml', 'v1', credentials=datalab.context.Context.default().credentials) + while True: + response = api.projects().operations().get(name=operation_full_name).execute() + if 'done' not in response or response['done'] != True: + time.sleep(3) + else: + if 'error' in response: + print(response['error']) + else: + print('Done.') + break + + +def package_and_copy(package_root_dir, setup_py, output_tar_path): + """Repackage an CloudML package and copy it to a staging dir. + + Args: + package_root_dir: the root dir to install package from. Usually you can get the path + from inside your module using a relative path to __file__. + setup_py: the path to setup.py. + output_tar_path: the GCS path of the output tarball package. + Raises: + ValueError if output_tar_path is not a GCS path, or setup_py does not exist. + """ + if not output_tar_path.startswith('gs://'): + raise ValueError('output_tar_path needs to be a GCS path.') + if not os.path.isfile(setup_py): + raise ValueError('Supplied file "%s" does not exist.' % setup_py) + + dest_setup_py = os.path.join(package_root_dir, 'setup.py') + # setuptools requires a "setup.py" in the current dir, so copy setup.py there. + # Also check if there is an existing setup.py. If so, back it up. + if os.path.isfile(dest_setup_py): + os.rename(dest_setup_py, dest_setup_py + '._bak_') + shutil.copyfile(setup_py, dest_setup_py) + + tempdir = tempfile.mkdtemp() + previous_cwd = os.getcwd() + os.chdir(package_root_dir) + try: + # Repackage. + sdist = ['python', dest_setup_py, 'sdist', '--format=gztar', '-d', tempdir] + subprocess.check_call(sdist) + + # Copy to GCS. + source = os.path.join(tempdir, '*.tar.gz') + gscopy = ['gsutil', 'cp', source, output_tar_path] + subprocess.check_call(gscopy) + return + finally: + os.chdir(previous_cwd) + os.remove(dest_setup_py) + if os.path.isfile(dest_setup_py + '._bak_'): + os.rename(dest_setup_py + '._bak_', dest_setup_py) + shutil.rmtree(tempdir) + + +def open_local_or_gcs(path, mode): + """Opens the given path.""" + + if path.startswith('gs://'): + try: + return gcsio.GcsIO().open(path, mode) + except Exception as e: # pylint: disable=broad-except + # Currently we retry exactly once, to work around flaky gcs calls. 
+ logging.error('Retrying after exception reading gcs file: %s', e) + time.sleep(10) + return gcsio.GcsIO().open(path, mode) + else: + return open(path, mode) + + +def glob_files(path): + """Glob the given path.""" + + if path.startswith('gs://'): + return gcsio.GcsIO().glob(path) + else: + return glob.glob(path) diff --git a/datalab/mlalpha/_cloud_models.py b/datalab/mlalpha/_cloud_models.py deleted file mode 100644 index 645024415..000000000 --- a/datalab/mlalpha/_cloud_models.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -"""Implements Cloud ML Model Operations""" - -from googleapiclient import discovery -import os -import time - -import datalab.context -import datalab.storage -import datalab.utils - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudModels(object): - """Represents a list of Cloud ML models for a project.""" - - def __init__(self, project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudML Model list that is iteratable - ("for model in CloudModels()"). - - Args: - project_id: project_id of the models. If not provided, default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default credentials - will be used. - api: an optional CloudML API client. - """ - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - - def _retrieve_models(self, page_token, page_size): - list_info = self._api.projects().models().list(parent='projects/' + self._project_id, - pageToken=page_token, pageSize=page_size).execute() - models = list_info.get('models', []) - page_token = list_info.get('nextPageToken', None) - return models, page_token - - def __iter__(self): - return iter(datalab.utils.Iterator(self._retrieve_models)) - - def get(self, model_name): - """Get details of a model. - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - Returns: a dictionary of the model details. - """ - full_name = model_name - if not model_name.startswith('projects/'): - full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - return self._api.projects().models().get(name=full_name).execute() - - def create(self, model_name): - """Create a model. - - Args: - model_name: the short name of the model, such as "iris". 
- """ - body = {'name': model_name} - parent = 'projects/' + self._project_id - self._api.projects().models().create(body=body, parent=parent).execute() - - def delete(self, model_name): - """Delete a model. - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - """ - full_name = model_name - if not model_name.startswith('projects/'): - full_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - return self._api.projects().models().delete(name=full_name).execute() - - -class CloudModelVersions(object): - """Represents a list of versions for a Cloud ML model.""" - - def __init__(self, model_name, project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudML model version list that is iteratable - ("for version in CloudModelVersions()"). - - Args: - model_name: the name of the model. It can be a model full name - ("projects/[project_id]/models/[model_name]") or just [model_name]. - project_id: project_id of the models. If not provided and model_name is not a full name - (not including project_id), default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default - credentials will be used. - api: an optional CloudML API client. - """ - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1alpha3', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - if not model_name.startswith('projects/'): - model_name = ('projects/%s/models/%s' % (self._project_id, model_name)) - self._full_model_name = model_name - self._model_name = self._full_model_name.split('/')[-1] - - def _retrieve_versions(self, page_token, page_size): - parent = self._full_model_name - list_info = self._api.projects().models().versions().list(parent=parent, - pageToken=page_token, pageSize=page_size).execute() - versions = list_info.get('versions', []) - page_token = list_info.get('nextPageToken', None) - return versions, page_token - - def __iter__(self): - return iter(datalab.utils.Iterator(self._retrieve_versions)) - - def get(self, version_name): - """Get details of a version. - - Args: - version: the name of the version in short form, such as "v1". - Returns: a dictionary containing the version details. - """ - name = ('%s/versions/%s' % (self._full_model_name, version_name)) - return self._api.projects().models().versions().get(name=name).execute() - - def _wait_for_long_running_operation(self, response): - if 'name' not in response: - raise Exception('Invaid response from service. Cannot find "name" field.') - while True: - response = self._api.projects().operations().get(name=response['name']).execute() - if 'done' not in response or response['done'] != True: - time.sleep(3) - else: - if 'error' in response: - print response['error'] - break - - def deploy(self, version_name, path): - """Deploy a model version to the cloud. - - Args: - version_name: the name of the version in short form, such as "v1". - path: the Google Cloud Storage path (gs://...) which contains the model files. - - Raises: Exception if the path is invalid or does not contain expected files. - Exception if the service returns invalid response. 
- """ - if not path.startswith('gs://'): - raise Exception('Invalid path. Only Google Cloud Storage path (gs://...) is accepted.') - if not datalab.storage.Item.from_url(os.path.join(path, 'export.meta')).exists(): - raise Exception('Cannot find export.meta from given path.') - - body = {'name': self._model_name} - parent = 'projects/' + self._project_id - try: - self._api.projects().models().create(body=body, parent=parent).execute() - except: - # Trying to create an already existing model gets an error. Ignore it. - pass - body = { - 'name': version_name, - 'deployment_uri': path, - } - response = self._api.projects().models().versions().create(body=body, - parent=self._full_model_name).execute() - self._wait_for_long_running_operation(response) - - def delete(self, version_name): - """Delete a version of model. - - Args: - version_name: the name of the version in short form, such as "v1". - """ - name = ('%s/versions/%s' % (self._full_model_name, version_name)) - response = self._api.projects().models().versions().delete(name=name).execute() - self._wait_for_long_running_operation(response) diff --git a/datalab/mlalpha/_cloud_predictor.py b/datalab/mlalpha/_cloud_predictor.py deleted file mode 100644 index 8209d77a1..000000000 --- a/datalab/mlalpha/_cloud_predictor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -from googleapiclient import discovery -import pandas as pd - -import datalab.context -import datalab.utils - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudPredictor(object): - """Preforms cloud predictions on given data.""" - - # TODO: Either remove label_output, or add code to load metadata from model dir and - # transform integer to label. Depending on whether online prediction returns label or not. - def __init__(self, model_name, version_name, label_output=None, - project_id=None, credentials=None, api=None): - """Initializes an instance of a CloudPredictor. - - Args: - model_name: the name of the model used for prediction. - version_name: the name of the version used for prediction. - label_output: the name of the output column where all values should be converted from - index to labels. Only useful in classification. If specified, metadata_path is required. - project_id: project_id of the model. If not provided, default project_id will be used. - credentials: credentials used to talk to CloudML service. If not provided, default - credentials will be used. - api: an optional CloudML API client. 
- """ - self._model_name = model_name - self._version_name = version_name - if project_id is None: - project_id = datalab.context.Context.default().project_id - self._project_id = project_id - if credentials is None: - credentials = datalab.context.Context.default().credentials - self._credentials = credentials - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - self._full_version_name = ('projects/%s/models/%s/versions/%s' % - (self._project_id, self._model_name, self._version_name)) - - def predict(self, data): - """Make predictions on given data. - - Args: - data: a list of feature data or a pandas DataFrame. Each element in the list is an instance - which is a dictionary of feature data. - An example: - [{"sepal_length": 4.9, "sepal_width": 2.5, "petal_length": 4.5, "petal_width": 1.7}, - {"sepal_length": 5.7, "sepal_width": 2.8, "petal_length": 4.1, "petal_width": 1.3}] - Returns: - A list of prediction results for given instances. Each element is a dictionary representing - output mapping from the graph. - An example: - [{"predictions": 1, "score": [0.00078, 0.71406, 0.28515]}, - {"predictions": 1, "score": [0.00244, 0.99634, 0.00121]}] - - Raises: Exception if bad response is received from the service - Exception if the prediction result has incorrect label types - """ - if isinstance(data, pd.DataFrame): - data = data.T.to_dict().values() - - request = self._api.projects().predict(body={'instances': data}, - name=self._full_version_name) - request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' - result = request.execute() - if 'predictions' not in result: - raise Exception('Invalid response from service. Cannot find "predictions" in response.') - - return result['predictions'] diff --git a/datalab/mlalpha/_cloud_runner.py b/datalab/mlalpha/_cloud_runner.py deleted file mode 100644 index 5da4958d8..000000000 --- a/datalab/mlalpha/_cloud_runner.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -import datetime -from googleapiclient import discovery - -import datalab.context - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class CloudRunner(object): - """CloudML Trainer API Wrapper that takes a job_request, add authentication information, - submit it to cloud, and get job response. - """ - - def __init__(self, job_request): - """Initializes an instance of a LocalRunner - - Args: - job_request: the arguments of the training job in a dict. 
For example, - { - 'package_uris': 'gs://my-bucket/iris/trainer-0.1.tar.gz', - 'python_module': 'trainer.task', - 'scale_tier': 'BASIC', - 'region': 'us-central1', - 'args': { - 'train_data_paths': ['gs://mubucket/data/features_train'], - 'eval_data_paths': ['gs://mubucket/data/features_eval'], - 'metadata_path': 'gs://mubucket/data/metadata.yaml', - 'output_path': 'gs://mubucket/data/mymodel/', - } - } - """ - - self._job_request = dict(job_request) - # convert job_args from dict to list as service required. - if 'args' in job_request and isinstance(job_request['args'], dict): - job_args = job_request['args'] - args = [] - for k,v in job_args.iteritems(): - if isinstance(v, list): - for item in v: - args.append('--' + k) - args.append(str(item)) - else: - args.append('--' + k) - args.append(str(v)) - self._job_request['args'] = args - - def _create_default_job_name(self): - job_name = datetime.datetime.now().strftime('%y%m%d_%H%M%S') - if 'python_module' in self._job_request: - job_name = self._job_request['python_module'].replace('.', '_') + \ - '_' + job_name - return job_name - - def run(self, job_id=None): - """Submit a training job to the CloudML service. - - Args: - job_id: id for the training job. If None, a UUID will be generated. - - Returns: job info returned from service. - """ - if job_id is None: - job_id = self._create_default_job_name() - job = { - 'job_id': job_id, - 'training_input': self._job_request, - } - context = datalab.context.Context.default() - cloudml = discovery.build('ml', 'v1beta1', credentials=context.credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - request = cloudml.projects().jobs().create(body=job, - parent='projects/' + context.project_id) - request.headers['user-agent'] = 'GoogleCloudDataLab/1.0' - return request.execute() diff --git a/datalab/mlalpha/_confusion_matrix.py b/datalab/mlalpha/_confusion_matrix.py deleted file mode 100644 index 0519a3f05..000000000 --- a/datalab/mlalpha/_confusion_matrix.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -from plotly.offline import iplot - - -class ConfusionMatrix(object): - """Represents a confusion matrix.""" - - def __init__(self, predicted_labels, true_labels, counts): - """Initializes an instance of a ComfusionMatrix. the length of predicted_values, - true_values, count must be the same. - - Args: - predicted_labels: a list of predicted labels. - true_labels: a list of true labels. - counts: a list of count for each (predicted, true) combination. 
- - Raises: Exception if predicted_labels, true_labels, and counts are not of the same size - """ - if len(predicted_labels) != len(true_labels) or len(true_labels) != len(counts): - raise Exception('The input predicted_labels, true_labels, counts need to be same size.') - self._all_labels = list(set(predicted_labels) | set(true_labels)) - data = [] - for value in self._all_labels: - predicts_for_current_true_label = \ - {p: c for p, t, c in zip(predicted_labels, true_labels, counts) if t == value} - # sort by all_values and fill in zeros if needed - predicts_for_current_true_label = [predicts_for_current_true_label.get(v, 0) - for v in self._all_labels] - data.append(predicts_for_current_true_label) - self._data = data - - def plot(self): - """Plot the confusion matrix.""" - figure_data = \ - { - "data": [ - { - "x": self._all_labels, - "y": self._all_labels, - "z": self._data, - "colorscale": "YlGnBu", - "type": "heatmap" - } - ], - "layout": { - "title": "Confusion Matrix", - "xaxis": { - "title": "Predicted value", - }, - "yaxis": { - "title": "True Value", - } - } - } - iplot(figure_data) diff --git a/datalab/mlalpha/_dataset.py b/datalab/mlalpha/_dataset.py deleted file mode 100644 index e9b5b01ef..000000000 --- a/datalab/mlalpha/_dataset.py +++ /dev/null @@ -1,417 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -import google.cloud.ml.features as features -import matplotlib.pyplot as plt -import numpy as np -import os -import pandas as pd -import pandas_profiling -from plotly.graph_objs import Histogram, Scatter, Scatter3d -from plotly.offline import iplot -from plotly import tools -import seaborn as sns -import tempfile - -import datalab.utils - -try: - import IPython.core.display -except ImportError: - raise Exception('This module can only be loaded in ipython.') - - -class DataSet(object): - """Represents a dataset that can be explored through its 'analyze()' function. - The data need to be able to fit in memory. - """ - - def __init__(self, feature_set, data_paths, format='csv'): - """Initializes an instance of DataSet. - - Args: - feature_set: A feature_set describing the data. The feature_set provides data types - (for example, csv), column names, schema, data transformers, etc. - This is the same class used in CloudML preprocessing. - data_paths: A dictionary with {name: path} pair. All data need to have the same schema. - format: the format of the data, currently only 'csv' or 'tsv'. 
- - Raises: Exception if data_paths is not a dictionary - Exception if the format is not csv or tsv - """ - self._feature_set = feature_set - if not isinstance(data_paths, dict): - raise Exception('Expect "data_paths" to be a dictionary.') - self._data_paths = data_paths - if format == 'csv': - self._delimiter = ',' - elif format=='tsv': - self._delimiter = '\t' - else: - raise Exception('Unsupported format "%s"' % format) - self._dataframes = {} - self._raw_dataframes = {} - self._concatenated_data_frame = None - self._concatenated_raw_data_frame = None - self._target_name = None - self._key_name = None - - def _get_dataframe_type(self, column): - if isinstance(column, features.NumericFeatureColumn): - return np.float64 - if isinstance(column, features.TargetFeatureColumn) and column.is_numeric: - return np.float64 - return str - - def _is_categorical_column(self, column): - if isinstance(column, features.CategoricalFeatureColumn): - return True - if isinstance(column, features.TargetFeatureColumn) and not column.is_numeric: - return True - return False - - def _transform_data(self, df): - df = df.copy(deep=True) - for name, value in type(self._feature_set).__dict__.iteritems(): - for column in (value if (type(value) == list or type(value) == tuple) else [value]): - if self._is_categorical_column(column): - concatenated_column = self._concatenated_raw_data_frame[column.name] - all_categories = concatenated_column.astype('category').cat.categories - df[column.name] = pd.Categorical(df[column.name], categories=all_categories) - if isinstance(column, features.NumericFeatureColumn): - concatenated_column = self._concatenated_raw_data_frame[column.name] - # Simulate metadata so we can create a transformer from CloudML features registry. - transform_info = { - 'type': 'numeric', - 'transform': column.transform, - } - transform_info[column.transform] = column.transform_args - transform_info['max'] = max(concatenated_column) - transform_info['min'] = min(concatenated_column) - transformer = features._registries.transformation_registry \ - .get_transformer(transform_info) - transformed = [transformer.transform(x)[0] for x in df[column.name]] - if column.transform == 'discretize': - # Transformed data contains a one_of_k list so need to convert it back to index. - # Categories needs to be num_of_buckets+2 to match the transformer behavior, - # where it creates a smaller-than-min and a greater-than-max buckets. - df[column.name] = pd.Series(pd.Categorical(transformed, - categories=range(transformer._buckets+2))) - else: - # TODO(qimingj): It is supposed to work with most transformers but still need to - # test them if new transformers become available. - df[column.name] = transformed - return df - - def _load_to_dataframes(self): - if self._concatenated_raw_data_frame is not None: - return # Already loaded. - - # Step 1: Get schema from feature_set class. - schema = {} - for name, value in type(self._feature_set).__dict__.iteritems(): - for column in (value if (type(value) == list or type(value) == tuple) else [value]): - if issubclass(type(column), features.FeatureColumn): - if isinstance(column, features.TargetFeatureColumn): - self._target_name = column.name - if isinstance(column, features.KeyFeatureColumn): - self._key_name = column.name - data_type = self._get_dataframe_type(column) - schema[column.name] = data_type - if self._target_name is None: - raise Exception('No target column found from feature_set') - - # Step 2: Load all non-text data into raw dataframes. 
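    # The loop below implements this step: gs:// inputs are first copied to a temporary
    # local file, then read with pandas using the feature_set column names and the dtype
    # schema built in Step 1. A standalone sketch of the same read call, with sample
    # in-memory data (column names, dtypes, and values are illustrative only):
    import numpy as np
    import pandas as pd
    from StringIO import StringIO  # this module already targets Python 2

    sample = StringIO('4.9, setosa\n5.7, versicolor\n')
    schema = {'sepal_length': np.float64, 'species': str}
    sample_df = pd.read_csv(sample, names=['sepal_length', 'species'], dtype=schema,
                            delimiter=',', skipinitialspace=True)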
- for name, data_path in self._data_paths.iteritems(): - local_file = data_path - if data_path.startswith('gs://'): - local_file = tempfile.mktemp() - datalab.utils.gcs_copy_file(data_path, local_file) - self._raw_dataframes[name] = pd.read_csv(local_file, - names=type(self._feature_set).csv_columns, - dtype=schema, - delimiter=self._delimiter, - skipinitialspace=True) - if data_path.startswith('gs://'): - os.remove(local_file) - self._concatenated_raw_data_frame = pd.concat(self._raw_dataframes.values()) - - # Step 3: Transform the data. - for name, raw_df in self._raw_dataframes.iteritems(): - self._dataframes[name] = self._transform_data(raw_df) - self._concatenated_data_frame = pd.concat(self._dataframes.values()) - - def _get_numeric_values(self, df, column_name): - if str(df[column_name].dtype) == 'category': - return df[column_name].cat.codes - else: - return df[column_name].values - - def _create_dummy_trace(self, x, y): - # Dummy trace is needed for scatter plot to a) set the same x and y ranges across multiple - # subplots, b) the categorical labels are sorted in the same way across multiple subplots - # (the order of the categories depend on the order they appear in the data). - # For a given axis, if it is categorical data, we draw one point for each category. - # If it is numeric data, we draw min and max. Usually on x and y axises we don't have same - # number of points, so we will pad one axis data. - # Note: This needs to go away if plotly python supports setting ranges and specifying - # category order across subplots. - if str(self._concatenated_data_frame[x].dtype) == 'category': - categories = self._concatenated_data_frame[x].cat.categories - x_dummy = list(categories) - else: - x_dummy = [min(self._concatenated_data_frame[x]), max(self._concatenated_data_frame[x])] - if str(self._concatenated_data_frame[y].dtype) == 'category': - categories = self._concatenated_data_frame[y].cat.categories - y_dummy = list(categories) - else: - y_dummy = [min(self._concatenated_data_frame[y]), max(self._concatenated_data_frame[y])] - if len(x_dummy) > len(y_dummy): - y_dummy = y_dummy + [y_dummy[-1]]*(len(x_dummy)-len(y_dummy)) - if len(x_dummy) < len(y_dummy): - x_dummy = x_dummy + [x_dummy[-1]]*(len(y_dummy)-len(x_dummy)) - - scatter_dummy = Scatter( - x=x_dummy, - y=y_dummy, - showlegend=False, - opacity=0, # Make it invisible. 
- hoverinfo='none', - ) - return scatter_dummy - - def _histogram(self, names, x): - concatenated_numeric_values = self._get_numeric_values(self._concatenated_data_frame, x) - start = min(concatenated_numeric_values) - end = max(concatenated_numeric_values) - size = 1 if str(self._concatenated_data_frame[x].dtype) == 'category' \ - else (max(concatenated_numeric_values) - min(concatenated_numeric_values)) / 10.0 - fig = tools.make_subplots(rows=1, cols=len(names), print_grid=False) - histogram_index = 1 - for name in names: - df = self._dataframes[name] - numeric_values = self._get_numeric_values(df, x) - text = df[x].cat.categories if str(df[x].dtype) == 'category' else None - histogram = Histogram( - name=name, - x=numeric_values, - xbins=dict( - start=start, - end=end, - size=size, - ), - text=text, - ) - fig.append_trace(histogram, 1, histogram_index) - fig.layout['xaxis' + str(histogram_index)].title = x - fig.layout['xaxis' + str(histogram_index)].range = [start, end] - fig.layout['yaxis' + str(histogram_index)].title = 'count' - histogram_index += 1 - fig.layout.width = min(500 * len(names), 1200) - fig.layout.height = 500 - iplot(fig) - - def _scatter_plot(self, names, x, y, color): - showscale = True if str(self._concatenated_data_frame[color].dtype) != 'category' else False - cmin = min(self._get_numeric_values(self._concatenated_data_frame, color)) - cmax = max(self._get_numeric_values(self._concatenated_data_frame, color)) - fig = tools.make_subplots(rows=1, cols=len(names), print_grid=False) - scatter_index = 1 - scatter_dummy = self._create_dummy_trace(x, y) - for name in names: - df = self._dataframes[name] - text = ["x=%s y=%s target=%s" % (str(a),str(b),str(t)) for a,b,t - in zip(df[x], df[y], df[color])] - scatter = Scatter( - name=name, - x=df[x], - y=df[y], - mode='markers', - text=text, - hoverinfo='text', - marker=dict( - color=self._get_numeric_values(df, color), - colorscale='Viridis', - showscale=showscale, - cmin=cmin, - cmax=cmax, - ) - ) - # Add dummy trace to set same ranges and categorical orders on subplots. 
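      # The dummy trace appended below is the trick used to keep every subplot on the
      # same axis ranges and category order: an invisible Scatter spanning the full data
      # range is drawn first, then the real trace on top. A self-contained sketch of such
      # a trace (the x/y values are illustrative only):
      from plotly.graph_objs import Scatter

      axis_pin = Scatter(x=[0.0, 7.9], y=['setosa', 'virginica'],
                         showlegend=False, opacity=0, hoverinfo='none')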
- fig.append_trace(scatter_dummy, 1, scatter_index) - fig.append_trace(scatter, 1, scatter_index) - fig.layout['xaxis' + str(scatter_index)].title = x - fig.layout['yaxis' + str(scatter_index)].title = y - scatter_index += 1 - fig.layout.width = min(500 * len(names), 1200) - fig.layout.height = 500 - iplot(fig) - - def _scatter3d_plot(self, names, x, y, z, color): - showscale = True if str(self._concatenated_data_frame[color].dtype) != 'category' else False - cmin = min(self._get_numeric_values(self._concatenated_data_frame, color)) - cmax = max(self._get_numeric_values(self._concatenated_data_frame, color)) - specs = [[{'is_3d':True}]*len(self._dataframes)] - fig = tools.make_subplots(rows=1, cols=len(names), specs=specs, print_grid=False) - scatter3d_index = 1 - for name in names: - df = self._dataframes[name] - text = ["x=%s y=%s z=%s, target=%s" % (str(a),str(b),str(c),str(t)) for a,b,c,t - in zip(df[x], df[y], df[z], df[color])] - scatter3d = Scatter3d( - name=name, - x=df[x], - y=df[y], - z=df[z], - mode='markers', - text=text, - hoverinfo='text', - marker=dict( - color=self._get_numeric_values(df, color), - colorscale='Viridis', - showscale=showscale, - cmin=cmin, - cmax=cmax, - ) - ) - fig.append_trace(scatter3d, 1, scatter3d_index) - fig.layout['scene' + str(scatter3d_index)].xaxis.title = x - fig.layout['scene' + str(scatter3d_index)].yaxis.title = y - fig.layout['scene' + str(scatter3d_index)].zaxis.title = z - scatter3d_index += 1 - fig.layout.width = min(500 * len(names), 1200) - fig.layout.height = 500 - iplot(fig) - - def _plot_x(self, names, x): - self._histogram(names, x) - if x != self._target_name: - self._scatter_plot(names, x, self._target_name, self._target_name) - - def _plot_xy(self, names, x, y): - self._scatter_plot(names, x, y, self._target_name) - - def _plot_xyz(self, names, x, y, z): - self._scatter3d_plot(names, x, y, z, self._target_name) - - def profile(self, names=None, columns=None): - """Print profiles of the dataset. - - Args: - names: the names of the data to plot. Such as ['train']. If None, all data in the datasets - will be used. - columns: The list of column names to plot correlations. If None, all numeric columns - will be used. - """ - self._load_to_dataframes() - if names is None: - names = self._raw_dataframes.keys() - html = '' - for name in names: - df = self._raw_dataframes[name] - html += '
' + \ - '%s
' % name - if columns is not None: - df = df[columns] - html += pandas_profiling.ProfileReport(df).html.replace('bootstrap', 'nonexistent') - IPython.core.display.display_html(IPython.core.display.HTML(html)) - - def analyze(self, names=None, columns=None): - """Analyze the data and report results in IPython output cell. The results are based - on preprocessed data as described in feature_set. - - Args: - names: the names of the data to plot. Such as ['train']. If None, all data in the datasets - will be used. - columns: The list of names of columns to analyze. If None, all numeric columns - will be used. - If one column is provided, displays a scatter plot between the column and target - column, and a histogram of the column. - If two columns are provided, displays a scatter plot between them, - colored by target column. - If three columns are provided, displays a 3d scatter plot between them, - colored by target column. - - Raises: - Exception if any column names are not found in the data or the columns are text. - Exception if columns are greater than 3 or less than 1. - """ - self._load_to_dataframes() - if columns is None: - columns = [x for x in self._concatenated_data_frame - if str(self._concatenated_data_frame[x].dtype) != 'object'] - if len(columns) > 3 or len(columns) < 1: - raise 'Found %d columns. ' % len(columns) + \ - 'Use "columns" parameter to specify one, two or three columns.' - for column_name in columns: - if column_name not in self._concatenated_data_frame: - raise Exception('Cannot find column "%s"' % column_name) - if str(self._concatenated_data_frame[column_name].dtype) == 'object': - raise Exception('Cannot analyze text column "%s"' % column_name) - - if names is None: - names = self._dataframes.keys() - if len(columns) == 1: - self._plot_x(names, columns[0]) - elif len(columns) == 2: - self._plot_xy(names, columns[0], columns[1]) - elif len(columns) == 3: - self._plot_xyz(names, columns[0], columns[1], columns[2]) - - def to_dataframes(self): - """Get the transformed data as a DataFrames - - Returns: the transformed data in {name: dataframe} dictionary. - """ - self._load_to_dataframes() - return self._dataframes - - def plot(self, names=None, columns=None): - """Plot correlation graphs on the specified columns, in n*n grids. - - Args: - names: the names of the data to plot. Such as ['train']. If None, all data in the datasets - will be used. - columns: The list of column names to plot correlations. If None, all numeric columns - will be used. - """ - self.to_dataframes() - is_classification = (str(self._concatenated_data_frame[self._target_name].dtype) == 'category') - if columns is not None and self._target_name not in columns and is_classification: - columns.append(self._target_name) - if names is None: - names = self._dataframes.keys() - - for name in names: - df_correlation = self._dataframes[name].copy(deep=True) - if self._key_name is not None: - del df_correlation[self._key_name] - if columns is not None: - df_correlation = df_correlation[columns] - for k in df_correlation.columns: - if str(df_correlation[k].dtype) == 'object' or str(df_correlation[k].dtype) == 'category': - if k != self._target_name: - # pairplot only works with numeric columns - del df_correlation[k] - else: - # pairplot does not deal with missing values well. For now fillna(0). 
- df_correlation[k] = df_correlation[k].fillna(0) - sns.set(style="ticks", color_codes=True) - if is_classification: - # pairplot doesn't like categories with all numbers - df_correlation[self._target_name] = map(lambda x: 'target ' + str(x), df_correlation[self._target_name]) - sns.pairplot(df_correlation, hue=self._target_name, dropna=True) - else: - sns.pairplot(df_correlation, dropna=True) - plt.suptitle(name) - plt.show() diff --git a/datalab/mlalpha/_job.py b/datalab/mlalpha/_job.py deleted file mode 100644 index 58d45b474..000000000 --- a/datalab/mlalpha/_job.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -"""Implements Cloud ML Operation wrapper.""" - -import datalab.utils -import datalab.context -from googleapiclient import discovery - - -# TODO(qimingj) Remove once the API is public since it will no longer be needed -_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \ - 'ml_v1beta1_discovery.json' - - -class Job(object): - """Represents a Cloud ML job.""" - - def __init__(self, name, context=None, api=None): - """Initializes an instance of a CloudML Job. - - Args: - name: the name of the job. It can be an operation full name - ("projects/[project_id]/operations/[operation_name]") or just [operation_name]. - context: an optional Context object providing project_id and credentials. - api: optional CloudML API client. - """ - if context is None: - context = datalab.context.Context.default() - self._context = context - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._context.credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - if not name.startswith('projects/'): - name = 'projects/' + self._context.project_id + '/jobs/' + name - self._name = name - self.refresh() - - @property - def info(self): - return self._info - - def refresh(self): - """ Refresh the job info. """ - self._info = self._api.projects().jobs().get(name=self._name).execute() - - -class Jobs(object): - """Represents a list of Cloud ML jobs for a project.""" - - def __init__(self, filter=None, context=None, api=None): - """Initializes an instance of a CloudML Job list that is iteratable ("for job in jobs()"). - - Args: - filter: filter string for retrieving jobs. Currently only "done=true|false" is supported. - context: an optional Context object providing project_id and credentials. - api: an optional CloudML API client. 
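    # A minimal usage sketch, assuming default project credentials are configured
    # (the "done=true" filter is the only filter documented above):
    import datalab.mlalpha

    jobs = datalab.mlalpha.Jobs(filter='done=true')
    for job in jobs:
        # each item is the raw job dictionary returned by the service
        job_id = job['jobId']
        state = job.get('state', 'UNKNOWN')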
- """ - self._filter = filter - if context is None: - context = datalab.context.Context.default() - self._context = context - if api is None: - api = discovery.build('ml', 'v1beta1', credentials=self._context.credentials, - discoveryServiceUrl=_CLOUDML_DISCOVERY_URL) - self._api = api - - def _retrieve_jobs(self, page_token, page_size): - list_info = self._api.projects().jobs().list(parent='projects/' + self._context.project_id, - pageToken=page_token, pageSize=page_size, - filter=self._filter).execute() - jobs = list_info.get('jobs', []) - page_token = list_info.get('nextPageToken', None) - return jobs, page_token - - def __iter__(self): - return iter(datalab.utils.Iterator(self._retrieve_jobs)) - - def get_job_by_name(self, name): - """ get a CloudML job by its name. - Args: - name: the name of the job. See "Job" class constructor. - """ - return Job(name, self._context, self._api) - diff --git a/datalab/mlalpha/_local_predictor.py b/datalab/mlalpha/_local_predictor.py deleted file mode 100644 index f488d863f..000000000 --- a/datalab/mlalpha/_local_predictor.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -import collections -import json -from numbers import Number -import numpy -import os -import pandas as pd - -import google.cloud.ml as ml - -from . import _metadata - - -class LocalPredictor(object): - """Preforms local predictions on given data. - """ - - def __init__(self, model_dir, label_output=None): - """Initializes an instance of LocalPredictor. - - Args: - model_dir: a directory that contains model checkpoint and metagraph. Can be local or GCS. - label_output: the name of the output column where all values should be converted from - index to labels. Only useful in classification. If specified, a metadata.yaml file is required. - """ - self._model_dir = model_dir - self._metadata_path = None - self._metadata = None - metadata_path = os.path.join(model_dir, "metadata.yaml") - if ml.util._file.file_exists(metadata_path): - self._metadata_path = metadata_path - self._metadata = _metadata.Metadata(metadata_path) - self._label_output = label_output - - def predict(self, data): - """Make predictions on given data. - - Args: - data: a list of feature data or a pandas DataFrame. Each element in the list is an instance - which is a dictionary of feature data. - An example: - [{"sepal_length": 4.9, "sepal_width": 2.5, "petal_length": 4.5, "petal_width": 1.7}, - {"sepal_length": 5.7, "sepal_width": 2.8, "petal_length": 4.1, "petal_width": 1.3}] - - Returns: - A list of prediction results for given instances. Each element is a dictionary representing - output mapping from the graph. 
- An example: - [{"predictions": 1, "score": [0.00078, 0.71406, 0.28515]}, - {"predictions": 1, "score": [0.00244, 0.99634, 0.00121]}] - - Raises: Exception if the prediction result has incorrect label types - """ - session, _ = ml.session_bundle.load_session_bundle_from_path(self._model_dir) - # get the mappings between aliases and tensor names for both inputs and outputs - input_key = json.loads(session.graph.get_collection( - ml.session_bundle.INPUTS_KEY)[0]).values()[0] - output_alias_map = json.loads(session.graph.get_collection(ml.session_bundle.OUTPUTS_KEY)[0]) - aliases, tensornames = zip(*output_alias_map.items()) - - if isinstance(data, pd.DataFrame): - data = data.T.to_dict().values() - - feed_dict = collections.defaultdict(list) - if self._metadata_path is not None: - transformer = ml.features.FeatureProducer(self._metadata_path) - for instance in data: - preprocessed = transformer.preprocess(instance) - feed_dict[input_key].append(preprocessed.SerializeToString()) - else: - for instance in data: - feed_dict[input_key].append(json.dumps(instance)) - - result = session.run(fetches=tensornames, feed_dict=feed_dict) - if len(result) == 1: - result = [result] - predictions = [] - for row in zip(*result): - prediction_row = {} - for name, value in zip(aliases, row): - if (self._metadata is not None and self._label_output is not None - and name == self._label_output): - if not isinstance(value, Number): - raise Exception('Cannot get labels because output "%s" is type %s but not number.' - % (name, type(value))) - prediction_row[name] = str(self._metadata.get_classification_label(value)) + \ - (' (%d)' % value) - elif isinstance(value, numpy.generic): - prediction_row[name] = numpy.asscalar(value) - elif isinstance(value, numpy.ndarray): - prediction_row[name] = value.tolist() - else: - prediction_row[name] = value - predictions.append(prediction_row) - return predictions diff --git a/datalab/mlalpha/_local_runner.py b/datalab/mlalpha/_local_runner.py deleted file mode 100644 index 6f2fa2f96..000000000 --- a/datalab/mlalpha/_local_runner.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - - -import io -import json -import os -import psutil -import subprocess -import tempfile -import time - -import google.cloud.ml as ml - -import datalab.utils - - -def _wait_and_kill(pid_to_wait, pids_to_kill): - """ Helper function. - Wait for a process to finish if it exists, and then try to kill a list of processes. - Args: - pid_to_wait: the process to wait for. - pids_to_kill: a list of processes to kill after the process of pid_to_wait finishes. - """ - if psutil.pid_exists(pid_to_wait): - psutil.Process(pid=pid_to_wait).wait() - - for pid_to_kill in pids_to_kill: - if psutil.pid_exists(pid_to_kill): - p = psutil.Process(pid=pid_to_kill) - p.kill() - p.wait() - - -class LocalRunner(object): - """Provides a "distributed" local run of a CloudML trainer packaged in a tarball. 
- It simulates CloudML service by spawning master, worker, and ps processses, - but instead of running in their own GKE cluster pods it runs all these as local processes. - """ - - def __init__(self, tar_path, module_to_run, logdir, replica_spec, program_args, all_args): - """Initializes an instance of a LocalRunner - - Args: - tar_path: the path of the trainer packaged in a tarball. Can be a local path or a GCS path. - module_to_run: the module to run in the tarball. - logdir: the directory to save logs. - replica_spec: the number of replicas for each job_type. - For example, {'master': 1, 'worker': 1, 'ps': 1}. - Currently it supports at most one process for each job_type, and 'master' is required. - program_args: the arguments of the training job program. For example, - { - 'train_data_paths': ['/content/mydata/features_train'], - 'eval_data_paths': ['/content/mydata/features_eval'], - 'metadata_path': '/content/mydata/metadata.yaml', - 'output_path': '/content/mymodel/', - } - all_args: all args that can be submitted to cloud training such as job name, replicas, etc. - It is aligned to the CloudML training service interface. In the program, it can be - retrieved by 'TF_CONFIG' env var (json serialized) under 'job' member. - - Raises: - Exception if replica_spec does not contain 'master' or its value is below one. - """ - self._tar_path = tar_path - self._module_to_run = module_to_run - self._logdir = logdir - self._replica_spec = replica_spec - self._program_args = program_args - if self._program_args is None: - self._program_args = {} - self._all_args = all_args - if self._all_args is None: - self._all_args = {} - self._cluster_spec = self._create_cluster_spec() - self._task_processes = {} - self._log_writers = {} - self._log_readers = {} - self._monitor_process = None - - def _create_cluster_spec(self): - """Create cluster spec that will be passed to each task process as command parameter. - This matches CloudML training service behavior. - """ - spec = {} - for job_type, replicas in self._replica_spec.iteritems(): - if replicas > 0: - port = datalab.utils.pick_unused_port() - spec[job_type] = ['localhost:' + str(port)] - if 'master' not in spec: - raise Exception('Need to have at least 1 master replica') - return spec - - def _create_tf_config(self, job_type): - """Create a list of arguments that will be passed to task process as command - parameters. This matches CloudML training service behavior. 
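    # The dictionary built here mirrors what the CloudML service exposes to each task
    # through the TF_CONFIG environment variable. An illustrative value for a single
    # worker follows (ports and args are made up; _start_task_process below shows how
    # the runner serializes it into the child process environment):
    import json
    import os

    tf_config = {
        'cluster': {'master': ['localhost:40001'],
                    'worker': ['localhost:40002'],
                    'ps': ['localhost:40003']},
        'task': {'type': 'worker', 'index': 0},
        'job': {'args': {'output_path': '/content/mymodel/'}},
    }
    os.environ['TF_CONFIG'] = json.dumps(tf_config)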
- """ - task_spec = {'type': job_type, 'index': 0} - return { - 'cluster': self._cluster_spec, - 'task': task_spec, - 'job': self._all_args, - } - - def _create_task_args(self): - args = [ - 'python', - '-m', - self._module_to_run, - ] - for k,v in self._program_args.iteritems(): - if isinstance(v, list): - for item in v: - args.append('--' + k) - args.append(str(item)) - else: - args.append('--' + k) - args.append(str(v)) - return args - - def _extract_tar(self): - extracted_dir = tempfile.mkdtemp() - tar_path = self._tar_path - if tar_path.startswith('gs://'): - tar_path = os.path.join(extracted_dir, os.path.basename(tar_path)) - ml.util._file.copy_file(self._tar_path, tar_path) - subprocess.check_call(['pip', 'install', tar_path, '--target', extracted_dir, - '--upgrade', '--force-reinstall']) - return extracted_dir - - def _clean_up(self): - processes_to_clean = list(self._task_processes.values()) - if self._monitor_process is not None: - processes_to_clean.append(self._monitor_process) - - for p in processes_to_clean: - if p.poll() is None: - # TODO(qimingj): consider p.kill() if it does not terminate in certain time. - p.terminate() - p.wait() - - for k,v in self._log_readers.iteritems(): - v.close() - for k,v in self._log_writers.iteritems(): - v.close() - - def _start_externel_monitor_process(self): - """Create a process that monitors the current process. If the current process exists, - Clean up a list of target processes. - This is needed to kill all running training processes when the kernel is restarted. - Note that Jupyter does not kill child processes when kernel is restarted. "atexit" - hook doesn't work either if the kernel is busy such as in time.sleep (seems SIGKILL - is used to restart the kernel). - """ - pids_to_kill = [p.pid for p in self._task_processes.values()] - script = 'import %s; %s._wait_and_kill(%s, %s)' % \ - (__name__, __name__, str(os.getpid()), str(pids_to_kill)) - self._monitor_process = subprocess.Popen(['python', '-c', script]) - - def _start_task_process(self, workdir, job_type): - args = self._create_task_args() - logfile = os.path.join(self._logdir, job_type) - # We cannot pipe child process's stdout and stderr directly because - # we need to append 'master', 'worker', 'ps' at begining of each - # log entry. A memory stream such as StringIO does not work here because - # Popen expects file descriptor. Therefore physical files are used to back - # up the messages. - w = io.open(logfile, 'w') - r = io.open(logfile, 'r') - tf_config = self._create_tf_config(job_type) - env = os.environ.copy() - env['TF_CONFIG'] = json.dumps(tf_config) - p = subprocess.Popen(args, env=env, stdout=w, stderr=w) - self._log_writers[job_type] = w - self._log_readers[job_type] = r - self._task_processes[job_type] = p - - def _print_task_output(self, callback, param, done): - if callback is None: - return - new_msgs = [] - for job_type, reader in self._log_readers.iteritems(): - content = reader.read() - if (content): - lines = content.split('\n') - for line in lines: - new_msgs.append(job_type + ': '+ line) - callback(self._replica_spec, new_msgs, done, param) - - def run(self, callback, param, interval): - """Run a training job locally. Block the caller until it finishes. - Prints out task processes stdout and stderr. - - Args: - callback: a callback that will be invoked every "interval" seconds. 
The signature is: - callback(replica_spec, new_msgs, done, param), where: - replica_spec: the replica spec of the runner - new_msgs: new output messages that are available from all task processes - done: whether the job is finished - param: a callback param. - param: the callback param that will be passed along whenever the callback is invoked. - interval: the interval in seconds controlling the callback frequency. - """ - workdir = self._extract_tar() - previous_cwd = os.getcwd() - os.chdir(workdir) - for job_type, replicas in self._replica_spec.iteritems(): - if replicas > 0: - self._start_task_process(workdir, job_type) - os.chdir(previous_cwd) - self._start_externel_monitor_process() - while self._task_processes['master'].poll() is None: - self._print_task_output(callback, param, False) - time.sleep(interval) - self._print_task_output(callback, param, True) - self._clean_up() diff --git a/datalab/mlalpha/_metadata.py b/datalab/mlalpha/_metadata.py deleted file mode 100644 index 4d8da347d..000000000 --- a/datalab/mlalpha/_metadata.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -import yaml - -import google.cloud.ml as ml - - -class Metadata(object): - """Helper class for parsing and serving feature metadata. - """ - - def __init__(self, metadata_path): - """Initializes an instance of Metadata. - - Args: - metadata_path: metadata file path. Can be local or GCS path. - """ - self._metadata_path = metadata_path - self._labels = {} - - def get_classification_label(self, label_index): - """Get classification label given a label index. - - Args: - label_index: the index of label. - - Returns: - The classification label, or label_index if the metadata is not for classification. - - Raises: - Exception if metadata is malformed. - """ - if len(self._labels) == 0: - with ml.util._file.open_local_or_gcs(self._metadata_path, 'r') as f: - metadata = yaml.load(f) - if 'columns' not in metadata: - raise Exception('Invalid metadata. No columns found.') - for column_name, column in metadata['columns'].iteritems(): - scenario = column.get('scenario', None) - # classification is the old name and now is called discrete. - if scenario == 'classification' or scenario == 'discrete': - if 'items' not in column: - raise Exception('Invalid metadata. No "items" found for "%s"' % column_name) - for label, index in column['items'].iteritems(): - self._labels[index] = label - break - elif scenario is not None: - return label_index # target column found but not classification - if len(self._labels) == 0: - raise Exception('Invalid metadata. No classification labels found.') - return self._labels[label_index] - - def get_target_name_and_scenario(self): - """Get name of the target feature and scenario. - - Returns: - Name of the target feature or scenario - - Raises: - Exception if metadata is malformed. 
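    # A minimal metadata.yaml shape that these two lookups understand, inferred from the
    # parsing code in this class (column, label, and feature names are illustrative):
    import yaml

    sample_metadata = yaml.load("""
    columns:
      species:
        scenario: discrete
        items:
          setosa: 0
          versicolor: 1
          virginica: 2
    features:
      species:
        columns: [species]
    """)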
- """ - with ml.util._file.open_local_or_gcs(self._metadata_path, 'r') as f: - metadata = yaml.load(f) - if 'features' not in metadata or 'columns' not in metadata: - raise Exception('Invalid metadata. No features or columns found.') - target_column_name, scenario = None, None - for column_name, column in metadata['columns'].iteritems(): - if 'scenario' in column: - target_column_name, scenario = column_name, column['scenario'] - break - if target_column_name is None: - raise Exception('Invalid metadata. No target found in columns.') - for feature_name, feature in metadata['features'].iteritems(): - if feature['columns'][0] == target_column_name: - return feature_name, scenario - raise Exception('Invalid metadata. No target found in features.') diff --git a/datalab/mlalpha/_package.py b/datalab/mlalpha/_package.py deleted file mode 100644 index 7b5a45987..000000000 --- a/datalab/mlalpha/_package.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -"""Implements Tarball Packaging for CloudML Training Program.""" - - -import os -import shutil -import subprocess -import tempfile - - -class Packager(object): - """Represents a packager.""" - - def package(self, files, package_name): - """Package a list of file contents into a python tarball package. - - Args: - files: A dictionary with key as module name and value as module contents. File names - will be key + 'py'. - package_name: the name of the package. - - Returns: the path of the created package, in a temp directory. - """ - tempdir = tempfile.mkdtemp() - trainer_dir = os.path.join(tempdir, package_name) - os.mkdir(trainer_dir) - files['__init__'] = '' - for name, content in files.iteritems(): - file_path = os.path.join(trainer_dir, name + '.py') - with open(file_path, 'w') as file: - file.write(content) - - setup_py = os.path.join(tempdir, 'setup.py') - content = """from setuptools import setup - -setup( - name='%s', - version='0.1', - packages=['%s'], -)""" % (package_name, package_name) - with open(setup_py, 'w') as file: - file.write(content) - previous_cwd = os.getcwd() - os.chdir(tempdir) - sdist = ['python', setup_py, 'sdist', '--format=gztar', '-d', tempdir] - subprocess.check_call(sdist) - os.chdir(previous_cwd) - shutil.rmtree(trainer_dir) - os.remove(setup_py) - return os.path.join(tempdir, '%s-0.1.tar.gz' % package_name) diff --git a/datalab/mlalpha/_summary.py b/datalab/mlalpha/_summary.py deleted file mode 100644 index 7db58bc28..000000000 --- a/datalab/mlalpha/_summary.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -"""Implements Cloud ML Summary wrapper.""" - -import datetime -import glob -import os -from tensorflow.core.util import event_pb2 -from tensorflow.python.lib.io import tf_record - -import datalab.storage as storage - - -class Summary(object): - """Represents TensorFlow summary events from files under a directory.""" - - def __init__(self, path): - """Initializes an instance of a Summary. - - Args: - path: the path of the directory which holds TensorFlow events files. - Can be local path or GCS path. - """ - self._path = path - - def _get_events_files(self): - if self._path.startswith('gs://'): - storage._api.Api.verify_permitted_to_read(self._path) - bucket, prefix = storage._bucket.parse_name(self._path) - items = storage.Items(bucket, prefix, None) - filtered_list = [item.uri for item in items if os.path.basename(item.uri).find('tfevents')] - return filtered_list - else: - path_pattern = os.path.join(self._path, '*tfevents*') - return glob.glob(path_pattern) - - def list_events(self): - """List all scalar events in the directory. - - Returns: - A set of unique event tags. - """ - event_tags = set() - for event_file in self._get_events_files(): - for record in tf_record.tf_record_iterator(event_file): - event = event_pb2.Event.FromString(record) - if event.summary is None or event.summary.value is None: - continue - for value in event.summary.value: - if value.simple_value is None: - continue - if value.tag is not None and value.tag not in event_tags: - event_tags.add(value.tag) - return event_tags - - def get_events(self, event_name): - """Get all events of a certain tag. - - Args: - event_name: the tag of event to look for. - - Returns: - A tuple. First is a list of {time_span, event_name}. Second is a list of {step, event_name}. - - Raises: - Exception if event start time cannot be found - """ - events_time = [] - events_step = [] - event_start_time = None - for event_file in self._get_events_files(): - for record in tf_record.tf_record_iterator(event_file): - event = event_pb2.Event.FromString(record) - if event.file_version is not None: - # first event in the file. - time = datetime.datetime.fromtimestamp(event.wall_time) - if event_start_time is None or event_start_time > time: - event_start_time = time - - if event.summary is None or event.summary.value is None: - continue - for value in event.summary.value: - if value.simple_value is None or value.tag is None: - continue - if value.tag == event_name: - if event.wall_time is not None: - time = datetime.datetime.fromtimestamp(event.wall_time) - events_time.append({'time': time, event_name: value.simple_value}) - if event.step is not None: - events_step.append({'step': event.step, event_name: value.simple_value}) - if event_start_time is None: - raise Exception('Empty or invalid TF events file. 
Cannot find event start time.') - for event in events_time: - event['time'] = event['time'] - event_start_time # convert time to timespan - events_time = sorted(events_time, key=lambda k: k['time']) - events_step = sorted(events_step, key=lambda k: k['step']) - return events_time, events_step diff --git a/datalab/mlalpha/commands/_mlalpha.py b/datalab/mlalpha/commands/_mlalpha.py deleted file mode 100644 index 4bcfa4127..000000000 --- a/datalab/mlalpha/commands/_mlalpha.py +++ /dev/null @@ -1,973 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -try: - import IPython - import IPython.core.magic -except ImportError: - raise Exception('This module can only be loaded in ipython.') - - -import collections -import fnmatch -import google.cloud.ml -import json -import math -import os -import plotly.graph_objs as go -from plotly.offline import iplot -import urllib -import yaml - -import datalab.context -import datalab.data -import datalab.mlalpha -import datalab.utils.commands - - -@IPython.core.magic.register_line_cell_magic -def mlalpha(line, cell=None): - """Implements the ml line cell magic. - - Args: - line: the contents of the ml line. - cell: the contents of the ml cell. - - Returns: - The results of executing the cell. - """ - parser = datalab.utils.commands.CommandParser(prog="mlalpha", description=""" -Execute various ml-related operations. Use "%%mlalpha -h" for help on a specific command. -""") - train_parser = parser.subcommand('train', 'Run a training job.') - train_parser.add_argument('--cloud', - help='Whether to run the training job in the cloud.', - action='store_true', default=False) - train_parser.set_defaults(func=_train) - jobs_parser = parser.subcommand('jobs', 'List jobs in a project.') - jobs_parser.add_argument('--count', - help='The number of jobs to browse from head, default to 10.') - jobs_parser.add_argument('--filter', help='Filter on jobs.') - jobs_parser.add_argument('--name', - help='The name of the operation to retrieve. If provided, show ' + - 'detailed information of the operation') - jobs_parser.add_argument('--trials', - help='Whether to show hyperparams tuning graph.', - action='store_true', default=False) - jobs_parser.set_defaults(func=_jobs) - summary_parser = parser.subcommand('summary', 'List or view summary events.') - summary_parser.add_argument('--dir', - help='A list of dirs to look for events. Can be local or GCS path.', - nargs='+', required=True) - summary_parser.add_argument('--name', - help='Names of the summary event. If provided, ' + - 'plot specified events in same graph (so make sure their ' + - 'units match). 
Otherwise, list all the unique event ' + - 'names from files in the directory.', nargs='*') - summary_parser.add_argument('--time', help='Whether to plot time events only.', - action='store_true', default=False) - summary_parser.add_argument('--step', help='Whether to plot step events only.', - action='store_true', default=False) - summary_parser.set_defaults(func=_summary) - features_parser = parser.subcommand('features', 'Generate featureset class template.') - features_parser.set_defaults(func=_features) - predict_parser = parser.subcommand('predict', 'Get prediction results given data instances.') - predict_parser.add_argument('--cloud', - help='Whether to run the prediction in the cloud.', - action='store_true', default=False) - predict_parser.add_argument('--model', - help='Model identifier. In local prediction, it is the path to ' + - 'a model directory. In cloud prediction (--cloud), it is ' + - 'model.version.', required=True) - predict_parser.add_argument('--label', - help='In classification scenario, which output in the graph ' + - 'is the label index. If provided, the index will be ' + - 'converted to label string.') - predict_parser.add_argument('--data', - help='The instance data used to predict. It can be a dataframe ' + - 'or a list defined in another cell. If not provided, the ' + - 'data needs to be provided in current cell input.') - predict_parser.add_argument('--project', - help='The project for the cloud model to use. if not provided, ' + - 'current project is used. Only needed in cloud prediction ' + - '(--cloud)') - predict_parser.set_defaults(func=_predict) - model_parser = parser.subcommand('model', 'List or view models.') - model_parser.add_argument('--name', - help='The name and version of the model. If "model.version", ' + - 'display the details of the model version. If "model", ' + - 'list the versions of the model. If not provided, list ' + - 'models under the project.') - model_parser.add_argument('--project', - help='The project under which it looks for models. if not ' + - 'provided, current project is used.') - model_parser.set_defaults(func=_model) - deploy_parser = parser.subcommand('deploy', 'Deploy a model.') - deploy_parser.add_argument('--name', - help='The name and version of the model in the form of ' + - '"model.version".', required=True) - deploy_parser.add_argument('--path', - help='The Google Cloud Storage path of the directory that ' + - 'contains an exported model.', required=True) - deploy_parser.add_argument('--project', - help='The project under which the model will be deployed. if not ' + - 'provided, current project is used.') - deploy_parser.set_defaults(func=_deploy) - delete_parser = parser.subcommand('delete', 'Delete a model or a model version.') - delete_parser.add_argument('--name', - help='The name and version of the model. If "model.version", ' + - 'delete the model version. If "model", delete the model.', - required=True) - delete_parser.add_argument('--project', - help='The project under which the model or version will be ' + - 'deleted. 
if not provided, current project is used.') - delete_parser.set_defaults(func=_delete) - preprocess_parser = parser.subcommand('preprocess', 'Generate preprocess code template.') - preprocess_parser.add_argument('--cloud', - help='Whether to produce code running in cloud.', - action='store_true', default=False) - preprocess_parser.set_defaults(func=_preprocess) - evaluate_parser = parser.subcommand('evaluate', 'Generate evaluate code template.') - evaluate_parser.add_argument('--cloud', - help='Whether to produce code running in cloud.', - action='store_true', default=False) - evaluate_parser.set_defaults(func=_evaluate) - dataset_parser = parser.subcommand('dataset', 'Define dataset to explore data.') - dataset_parser.add_argument('--name', - help='The name of the dataset to define.', required=True) - dataset_parser.set_defaults(func=_dataset) - module_parser = parser.subcommand('module', 'Define a trainer module.') - module_parser.add_argument('--name', help='The name of the module.', required=True) - module_parser.add_argument('--main', - help='Whether this module is has main function in the trainer ' + - 'package.', - action='store_true', default=False) - module_parser.set_defaults(func=_module) - package_parser = parser.subcommand('package','Create a trainer package from all modules ' + - 'defined with %%mlalpha module.') - package_parser.add_argument('--name', help='The name of the package.', required=True) - package_parser.add_argument('--output', help='the output dir of the package.', required=True) - package_parser.set_defaults(func=_package) - namespace = datalab.utils.commands.notebook_environment() - return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace) - - -def _get_replica_count(config): - worker_count = config.get('worker_count', 0) - parameter_server_count = config.get('parameter_server_count', 0) - if worker_count > 0 or parameter_server_count > 0: - return 1, worker_count, parameter_server_count - scale_tier = config.get('scale_tier', 'BASIC') - if scale_tier == 'BASIC': - return 1, 0, 0 - else: - return 1, 1, 1 - - -def _local_train_callback(replica_spec, new_msgs, done, all_msgs): - if new_msgs: - all_msgs += new_msgs - del all_msgs[0:-20] - IPython.display.clear_output() - IPython.display.display_html('
Job Running...
', raw=True) - log_file_html = '' - log_url_prefix = '' - if datalab.context._utils._in_datalab_docker(): - log_url_prefix = '/_nocachecontent/' - for job_type, replicas in replica_spec.iteritems(): - if replicas > 0: - log_file_html += ('%s log  ' - % (log_url_prefix + job_type, job_type)) - IPython.display.display_html(log_file_html, raw=True) - IPython.display.display_html('
'.join(all_msgs), raw=True) - if done: - IPython.display.display_html('
Job Finished.
', raw=True) - - -def _output_train_template(): - content = """%%mlalpha train [--cloud] -package_uris: gs://your-bucket/my-training-package.tar.gz -python_module: your_program.your_module -scale_tier: BASIC -region: us-central1 -args: - string_arg: value - int_arg: value - appendable_arg: - - value1 - - value2 -""" - IPython.get_ipython().set_next_input(content) - parameters = ['package_uris', 'python_module', 'scale_tier', 'region', 'args'] - required_local = [False, False, False, False, False] - required_cloud = [True, True, True, True, False] - description = [ - 'A GCS or local (for local run only) path to your python training program package.', - 'The module to run.', - 'Type of resources requested for the job. On local run, BASIC means 1 master process only, ' + - 'and any other values mean 1 master 1 worker and 1 ps processes. But you can also ' + - 'override the values by setting worker_count and parameter_server_count. ' + - 'On cloud, see service definition for possible values.', - 'Where the training job runs. For cloud run only.', - 'Args that will be passed to your training program.' - ] - data = [{'Parameters': x[0], 'Local Run Required': str(x[1]), - 'Cloud Run Required': str(x[2]), 'Description': x[3]} - for x in zip(parameters, required_local, required_cloud, description)] - html = ('
A training input template is created in the next cell for you. See cell input ' + - 'instructions below.
') - html += datalab.utils.commands.HtmlBuilder.render_table(data, - ['Parameters', 'Local Run Required', 'Cloud Run Required', 'Description']) - - return IPython.core.display.HTML(html) - - -def _train(args, cell): - """ Train a model. """ - if not cell: - return _output_train_template() - - env = datalab.utils.commands.notebook_environment() - config = datalab.utils.commands.parse_config(cell, env) - if args['cloud']: - datalab.utils.commands.validate_config_must_have(config, - ['package_uris', 'python_module', 'scale_tier', 'region']) - runner = datalab.mlalpha.CloudRunner(config) - job_info = runner.run() - job_short_name = job_info['jobId'] - html = '
Job "%s" was submitted successfully.
' % job_short_name - html += 'Run "%%mlalpha jobs --name %s" to view the status of the job.
' % job_short_name - log_url_query_strings = { - 'project': datalab.context.Context.default().project_id, - 'resource': 'ml.googleapis.com/job_id/' + job_short_name - } - log_url = 'https://console.developers.google.com/logs/viewer?' + \ - urllib.urlencode(log_url_query_strings) - html += '
Click here to view cloud log.
' % log_url - html += 'Start TensorBoard by running "%tensorboard start --logdir=<YourLogDir>".

' - return IPython.core.display.HTML(html) - else: - # local training - package_path = None - if 'package_uris' not in config: - if '_ml_modules_' not in env: - raise Exception('Expect either modules defined with "%%mlalpha module", ' + - 'or "package_uris" in cell.') - if '_ml_modules_main_' not in env: - raise Exception('Expect one ml module defined with "--main flag" as the python ' + - 'program entry point.') - package_path = datalab.mlalpha.Packager().package(env['_ml_modules_'], 'trainer') - config['package_uris'] = package_path - config['python_module'] = 'trainer.' + env['_ml_modules_main_'] - - trainer_uri = config['package_uris'] - module_name = config['python_module'] - masters, workers, parameter_servers = _get_replica_count(config) - replica_spec = {'master': masters, 'worker': workers, 'ps': parameter_servers} - all_messages = [] - log_dir = os.getcwd() - if datalab.context._utils._in_datalab_docker(): - log_dir = '/datalab/nocachecontent' - if not os.path.exists(log_dir): - os.makedirs(log_dir) - program_args = config.get('args', None) - runner = datalab.mlalpha.LocalRunner(trainer_uri, module_name, log_dir, replica_spec, program_args, config) - runner.run(_local_train_callback, all_messages, 3) - if package_path is not None: - os.remove(package_path) - - -def _plot_hyperparams_tuning(training_input, training_output): - if ('hyperparameters' not in training_input or 'trials' not in training_output or - len(training_output['trials']) == 0): - print 'No trials found. Maybe none of the trials has completed' - return - - maximize = training_input['hyperparameters']['goal'] == 'MAXIMIZE' - hyperparam_scales = {} - for param in training_input['hyperparameters']['params']: - hyperparam_scales[param['parameterName']] = param.get('scaleType', '') - instances = [] - for trial in training_output['trials']: - if 'finalMetric' not in trial: - continue - instance = collections.OrderedDict() - instance.update({'Objective': trial['finalMetric']['objectiveValue']}) - instance.update({'Trial': trial['trialId']}) - instance.update({'Training Step': trial['finalMetric']['trainingStep']}) - hyperparams = dict(trial['hyperparameters']) - for k in trial['hyperparameters'].keys(): - if hyperparam_scales.get(k, '') == 'UNIT_LOG_SCALE': - hyperparams[k + '(log)'] = math.log10(float(hyperparams[k])) - instance.update(hyperparams) - instances.append(instance) - if len(instances) == 0: - print 'No finalMetric found in any trials. ' - return - - instances_sorted = sorted(instances, key=lambda k: k['Objective'], reverse=maximize) - # Convert list of dictionary to dictionary of list so it is more compact. - data = instances_sorted[0] - for k in data.keys(): - data[k] = [d[k] for d in instances_sorted] - - HTML_TEMPLATE = """ -
-
- -""" - graph_id = 'v' + datalab.utils.commands.Html.next_id() - grid_id = 'g' + datalab.utils.commands.Html.next_id() - color_range_string = json.dumps([min(data['Objective']), max(data['Objective'])]) - data_string = json.dumps(data) - maximize_string = json.dumps(maximize) - html = HTML_TEMPLATE % (graph_id, grid_id, data_string, color_range_string, maximize_string, graph_id, grid_id) - return IPython.core.display.HTML(html) - - -def _jobs(args, _): - """ List the ML jobs in a project. """ - jobs = datalab.mlalpha.Jobs(filter=args['filter']) - if args['name']: - job = jobs.get_job_by_name(args['name']) - if args['trials']: - if ('trainingInput' not in job.info or 'trainingOutput' not in job.info): - print 'job %s doesn\'t seem like a hyperparameter tuning training job.' % args['name'] - return - return _plot_hyperparams_tuning(job.info['trainingInput'], job.info['trainingOutput']) - else: - job_yaml = yaml.safe_dump(job.info) - return datalab.utils.commands.render_text(job_yaml, preformatted=True) - else: - count = int(args['count'] or 10) - data = [] - for job in jobs: - if count <= 0: - break - count -= 1 - data.append({'Id': job['jobId'], 'State': job.get('state', 'UNKNOWN')}) - return datalab.utils.commands.render_dictionary(data, ['Id', 'State']) - - -def _plot(data, x_name, x_title, y_names): - y_title = ','.join(y_names) - layout = go.Layout( - title=y_title, - xaxis=dict( - title=x_title, - ), - yaxis=dict( - title=y_title, - ) - ) - plot_data = [] - for trace_name, trace_data in data.iteritems(): - for y_name, events in zip(y_names, trace_data): - x = [d[x_name] for d in events] - y = [d[y_name] for d in events] - plot_data.append({'x': x, 'y': y, 'name': y_name + '-' + trace_name}) - fig = go.Figure(data=plot_data, layout=layout) - iplot(fig) - - -def get_dirs(pattern): - """ Finds all matching dirs given a dir path - for example: - input: gs://mybucket/iris/hp/*/summaries - output: [gs://mybucket/iris/hp/1/summaries, gs://mybucket/iris/hp/2/summaries] - Args: - pattern: string pattern for the directory path - - Returns: - List of all matching directories - """ - dirs = set() - path = pattern.rstrip('/') - for p in google.cloud.ml.util._file.glob_files(path + '/*'): - dir = None - while True: - p = p[:p.rfind('/')] - if fnmatch.fnmatch(p, path): - dir = p - else: - break - if dir: - dirs.add(dir) - return list(dirs) - - -def _summary(args, _): - """ Display summary events in a directory. """ - dirs = args['dir'] - event_names = args['name'] - if event_names is not None and len(event_names) > 0: - time_data = {} - step_data = {} - dir_index = 0 - for dir_pattern in dirs: - for dir in get_dirs(dir_pattern): - dir_index += 1 - summary = datalab.mlalpha.Summary(dir) - trace_events_time = [] - trace_events_step = [] - for event_name in event_names: - events_time, events_step = summary.get_events(event_name) - for e in events_time: - e['time'] = e['time'].total_seconds() - trace_events_time.append(events_time) - trace_events_step.append(events_step) - # Try to find 'label' file under the dir. If found, use the content as label. - # Otherwise, use dir name as label. 
- label = dir - label_file = os.path.join(dir, 'label') - if google.cloud.ml.util._file.file_exists(label_file) == True: - label = 'dir%d/' % dir_index + google.cloud.ml.util._file.load_file(label_file) - time_data[label] = trace_events_time - step_data[label] = trace_events_step - if (not args['time'] and not args['step']) or args['time']: - _plot(time_data, 'time', 'seconds', event_names) - if (not args['time'] and not args['step']) or args['step']: - _plot(step_data, 'step', 'step', event_names) - else: - event_names = [] - for dir_pattern in dirs: - for dir in get_dirs(dir_pattern): - summary = datalab.mlalpha.Summary(dir) - event_names += summary.list_events() - event_names = list(set(event_names)) # remove duplicates - return datalab.utils.commands.render_list(event_names) - - -def _output_features_template(): - content = """%%mlalpha features -path: REQUIRED_Fill_In_Gcs_or_Local_Path -headers: List_Of_Column_Names_Seperated_By_Comma -target: REQUIRED_Fill_In_Name_Or_Index_Of_Target_Column -id: Fill_In_Name_Or_Index_Of_Id_Column -format: csv_or_tsv -""" - IPython.get_ipython().set_next_input(content, replace=True) - - -def _features(args, cell): - """ Generate FeatureSet Class From Data""" - if not cell: - _output_features_template() - return - - env = datalab.utils.commands.notebook_environment() - config = datalab.utils.commands.parse_config(cell, env) - datalab.utils.commands.validate_config(config, ['path', 'target'], - optional_keys=['headers', 'id', 'format']) - format = config.get('format', 'csv') - # For now, support CSV and TSV only. - datalab.utils.commands.validate_config_value(format, ['csv', 'tsv']) - delimiter = ',' if format == 'csv' else '\t' - csv = datalab.data.Csv(config['path'], delimiter=delimiter) - headers = None - if 'headers' in config: - headers = [e.strip() for e in config['headers'].split(',')] - df = csv.browse(max_lines=100, headers=headers) - command = '%%mlalpha features\n' + cell - _output_featureset_template(df.dtypes, config['target'], config.get('id', None), command) - - -def _output_featureset_template(dtypes, target_column, id_column, command): - if target_column not in dtypes: - if type(target_column) is int: - target_column = dtypes.keys()[target_column] - else: - raise Exception('Column "%s" not found. It can be a name in headers, or an index number.' - % target_column) - if id_column is not None and id_column not in dtypes: - if type(id_column) is int: - id_column = dtypes.keys()[id_column] - else: - raise Exception('Column "%s" not found. It can be a name in headers, or an index number.' - % id_column) - is_regression = str(dtypes[target_column]).startswith('int') or \ - str(dtypes[target_column]).startswith('float') - scenario = 'continuous' if is_regression == True else 'discrete' - columns_remaining = dict(dtypes) - command_lines = command.split('\n') - # add spaces in the beginning so they are aligned with others. - command_formatted = '\n'.join([' ' + line for line in command_lines]) - content = """import google.cloud.ml.features as features - -class CsvFeatures(object): - \"\"\"This class is generated from command line: -%s - Please modify it as appropriate!!! 
- \"\"\" - csv_columns = (%s) - %s = features.target('%s').%s() -""" % (command_formatted, ','.join(["'" + e + "'" for e in dtypes.keys()]), - target_column.replace('-', '_'), target_column, scenario) - del columns_remaining[target_column] - - if id_column is not None: - content += """ %s = features.key('%s')\n""" % (id_column.replace('-', '_'), id_column) - del columns_remaining[id_column] - - text_columns = [k for k,v in columns_remaining.iteritems() if str(v) == 'object'] - categorical_columns = [k for k,v in columns_remaining.iteritems() if str(v) == 'category'] - numeric_columns = [k for k in columns_remaining.keys() if k not in text_columns and - k not in categorical_columns] - if len(numeric_columns) + len(categorical_columns) > 0: - content += """ attrs = [\n""" - for numeric_name in numeric_columns: - content += """ features.numeric('%s').identity(),\n""" % numeric_name - for categorical_name in categorical_columns: - content += """ features.categorical('%s'),\n""" % categorical_name - content += """ ]\n""" - if len(text_columns) > 0: - for text_name in text_columns: - content += """ %s = features.text('%s').bag_of_words(vocab_size=10000)\n\n""" % \ - (text_name.replace('-', '_'), text_name) - IPython.get_ipython().set_next_input(content, replace=True) - - -def _predict(args, cell): - if args['data'] is not None: - instances = datalab.utils.commands.get_notebook_item(args['data']) - if instances is None: - raise Exception('Data "%s" is not defined' % args['data']) - elif cell is not None: - instances = [] - lines = cell.split('\n') - for line in lines: - instances.append(line) - else: - raise Exception('Expect instance data. Can be provided in input cell, or through ' - '--data args.') - if args['cloud']: - parts = args['model'].split('.') - if len(parts) != 2: - raise Exception('Invalid model name for cloud prediction. Use "model.version".') - lp = datalab.mlalpha.CloudPredictor(parts[0], parts[1], - label_output=args['label'], - project_id=args['project']) - else: - lp = datalab.mlalpha.LocalPredictor(args['model'], - label_output=args['label']) - return datalab.utils.commands.render_text(yaml.safe_dump(lp.predict(instances), - default_flow_style=False), - preformatted=True) - - -def _model(args, _): - if args['name'] is None: - data = list(datalab.mlalpha.CloudModels(project_id=args['project'])) - if len(data) > 0: - return datalab.utils.commands.render_dictionary(data, data[0].keys()) - print 'No models found.' - return - - parts = args['name'].split('.') - if len(parts) == 1: - data = list(datalab.mlalpha.CloudModelVersions(parts[0], project_id=args['project'])) - if len(data) > 0: - return datalab.utils.commands.render_dictionary(data, data[0].keys()) - print 'No versions found.' - return - elif len(parts) == 2: - versions = datalab.mlalpha.CloudModelVersions(parts[0], project_id=args['project']) - version_yaml = yaml.safe_dump(versions.get(parts[1])) - return datalab.utils.commands.render_text(version_yaml, preformatted=True) - else: - raise Exception('Too many "." in name. Use "model" or "model.version".') - - -def _deploy(args, _): - parts = args['name'].split('.') - if len(parts) != 2: - raise Exception('Invalid model name. 
Use "model.version".') - versions = datalab.mlalpha.CloudModelVersions(parts[0], project_id=args['project']) - versions.deploy(parts[1], args['path']) - - -def _delete(args, _): - parts = args['name'].split('.') - if len(parts) == 1: - models = datalab.mlalpha.CloudModels(project_id=args['project']) - models.delete(parts[0]) - elif len(parts) == 2: - versions = datalab.mlalpha.CloudModelVersions(parts[0], project_id=args['project']) - versions.delete(parts[1]) - else: - raise Exception('Too many "." in name. Use "model" or "model.version".') - - -def _output_preprocess_template(is_cloud): - content = """%%mlalpha preprocess""" - if is_cloud: - content += ' --cloud' - content += """ -train_data_path: REQUIRED_Fill_In_Training_Data_Path -eval_data_path: Fill_In_Evaluation_Data_Path -data_format: REQUIRED_CSV_or_JSON -output_dir: REQUIRED_Fill_In_Output_Path -feature_set_class_name: REQUIRED_Fill_In_FeatureSet_Class_name -""" - IPython.get_ipython().set_next_input(content, replace=True) - - -def _pipeline_definition_code(is_cloud, job_name_prefix): - # TODO: Remove 'extra_packages' once it is not needed by dataflow. - if is_cloud: - content_pipeline = \ -"""import datetime -options = { - 'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'), - 'temp_location': os.path.join(OUTPUT_DIR, 'tmp'), - 'job_name': '%s' + '-' + datetime.datetime.now().strftime('%%y%%m%%d-%%H%%M%%S'), - 'project': '%s', - 'extra_packages': [ml.sdk_location], - 'teardown_policy': 'TEARDOWN_ALWAYS', - 'no_save_main_session': True -} -opts = beam.pipeline.PipelineOptions(flags=[], **options) -pipeline = beam.Pipeline('DataflowPipelineRunner', options=opts) -""" % (job_name_prefix, datalab.context.Context.default().project_id) - else: - content_pipeline = """pipeline = beam.Pipeline('DirectPipelineRunner')\n""" - - return content_pipeline - - -def _header_code(command): - header = \ -"""\"\"\" -Following code is generated from command line: -%s\n -Please modify as appropriate!!! 
-\"\"\" -""" % command - return header - - -def _output_preprocess_code_template(command, is_cloud, data_format, train_data_path, - output_dir, feature_set_class_name, eval_data_path=None): - content_header = _header_code(command) - - content_imports = \ -"""import apache_beam as beam -import google.cloud.ml as ml -import google.cloud.ml.io as io -import os -""" - - if data_format == 'CSV': - coder = """io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns)""" - else: # JSON - coder = """io.JsonCoder.from_feature_set(feature_set)""" - job_name_prefix = 'preprocess-' + feature_set_class_name.lower().replace('_', '-') - - content_defines = \ -"""feature_set = %s() -OUTPUT_DIR = '%s' -%s -""" % (feature_set_class_name, output_dir, _pipeline_definition_code(is_cloud, job_name_prefix)) - - content_preprocessing = \ -"""training_data = beam.io.TextFileSource( - '%s', - strip_trailing_newlines=True, - coder=%s) -train = pipeline | beam.Read('ReadTrainingData', training_data) -""" % (train_data_path, coder) - - if eval_data_path is not None: - content_preprocessing += \ -""" -eval_data = beam.io.TextFileSource( - '%s', - strip_trailing_newlines=True, - coder=%s) -eval = pipeline | beam.Read('ReadEvalData', eval_data) -""" % (eval_data_path, coder) - - eval_features = ', eval_features' if eval_data_path is not None else '' - eval_input = ', eval' if eval_data_path is not None else '' - - content_preprocessing += \ -""" -(metadata, train_features%s) = ((train%s) | 'Preprocess' - >> ml.Preprocess(feature_set, input_format='csv', - format_metadata={'headers': feature_set.csv_columns})) -""" % (eval_features, eval_input) - - content_preprocessing += \ -""" -(metadata | 'SaveMetadata' - >> io.SaveMetadata(os.path.join(OUTPUT_DIR, 'metadata.yaml'))) -""" - - content_preprocessing += \ -""" -(train_features | 'SaveTrain' - >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_train'), shard_name_template='')) -""" - - if eval_data_path is not None: - content_preprocessing += \ -""" -(eval_features | 'SaveEval' - >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_eval'), shard_name_template='')) -""" - - content_run = """pipeline.run()""" - - content = \ -""" -# header -%s -# imports -%s -# defines -%s -# preprocessing -%s -# run pipeline -%s -""" % (content_header, content_imports, content_defines, content_preprocessing, content_run) - IPython.get_ipython().set_next_input(content, replace=True) - - -def _preprocess(args, cell): - if not cell: - _output_preprocess_template(args['cloud']) - return - - env = datalab.utils.commands.notebook_environment() - config = datalab.utils.commands.parse_config(cell, env) - datalab.utils.commands.validate_config(config, - ['train_data_path', 'data_format', 'output_dir', 'feature_set_class_name'], - optional_keys=['eval_data_path']) - datalab.utils.commands.validate_config_value(config['data_format'], ['CSV', 'JSON']) - command = '%%mlalpha preprocess' - if args['cloud']: - command += ' --cloud' - command += '\n' + cell - _output_preprocess_code_template(command, args['cloud'], config['data_format'], - config['train_data_path'], config['output_dir'], config['feature_set_class_name'], - eval_data_path=config.get('eval_data_path', None)) - - -def _output_evaluate_template(is_cloud): - content = """%%mlalpha evaluate""" - if is_cloud: - content += ' --cloud' - content += """ -preprocessed_eval_data_path: REQUIRED_Fill_In_Eval_Data_Path -metadata_path: REQUIRED_Fill_In_Metadata_Path -model_dir: REQUIRED_Fill_In_Model_Path -output_dir: 
REQUIRED_Fill_In_Output_Path -output_prediction_name: Fill_In_Prediction_Output -output_score_name: Fill_In_Score_Output -""" - IPython.get_ipython().set_next_input(content, replace=True) - - -def _output_evaluate_code_template(command, is_cloud, preprocessed_eval_data_path, - metadata_path, model_dir, output_dir, - output_prediction_name=None, output_score_name=None): - # output_prediction_name is only useful for generating results analysis code. - # It is only used in classification but not regression. - content_header = _header_code(command) - - content_imports = \ -"""import apache_beam as beam -import google.cloud.ml as ml -import google.cloud.ml.analysis as analysis -import google.cloud.ml.io as io -import json -import os -""" - - target_name, scenario = datalab.mlalpha.Metadata(metadata_path).get_target_name_and_scenario() - target_type = 'float_list' if scenario == 'continuous' else 'int64_list' - content_definitions = \ -"""def extract_values((example, prediction)): - import tensorflow as tf - tf_example = tf.train.Example() - tf_example.ParseFromString(example.values()[0]) - feature_map = tf_example.features.feature - values = {'target': feature_map['%s'].%s.value[0]} - values.update(prediction) - return values - -OUTPUT_DIR = '%s' -%s -""" % (target_name, target_type, output_dir, _pipeline_definition_code(is_cloud, 'evaluate')) - - content_evaluation = \ -""" -eval_features = (pipeline | 'ReadEval' >> io.LoadFeatures('%s')) -trained_model = pipeline | 'LoadModel' >> io.LoadModel('%s') -evaluations = (eval_features | 'Evaluate' >> ml.Evaluate(trained_model) | - beam.Map('ExtractEvaluationResults', extract_values)) -eval_data_sink = beam.io.TextFileSink(os.path.join(OUTPUT_DIR, 'eval'), shard_name_template='') -evaluations | beam.io.textio.WriteToText(os.path.join(OUTPUT_DIR, 'eval'), shard_name_template='') -""" % (preprocessed_eval_data_path, model_dir) - - output_analysis = (output_prediction_name is not None and scenario != 'continuous') - content_analysis = '' - if output_analysis: - if output_score_name is not None: - score_value = """values['%s'][values['%s']]""" % (output_score_name, output_prediction_name) - else: - score_value = '0.0' - - content_analysis = \ -"""def make_data_for_analysis(values): - return { - 'target': values['target'], - 'predicted': values['%s'], - 'score': %s, - } - -metadata = pipeline | io.LoadMetadata('%s') -analysis_source = evaluations | beam.Map('CreateAnalysisSource', make_data_for_analysis) -confusion_matrix, precision_recall, logloss = (analysis_source | - 'Analyze Model' >> analysis.AnalyzeModel(metadata)) -confusion_matrix_file = os.path.join(OUTPUT_DIR, 'analyze_cm.json') -confusion_matrix_sink = beam.io.TextFileSink(confusion_matrix_file, shard_name_template='') -confusion_matrix | beam.io.Write('WriteConfusionMatrix', confusion_matrix_sink) -""" % (output_prediction_name, score_value, metadata_path) - - content_run = """pipeline.run()""" - - content = \ -""" -# header -%s -# imports -%s -# defines -%s -# evaluation -%s -# analysis -%s -# run pipeline -%s -""" % (content_header, content_imports, content_definitions, - content_evaluation, content_analysis, content_run) - - if output_analysis: - content += """ -# View Confusion Matrix with the following code: -# -# import datalab.mlalpha -# import yaml -# with ml.util._file.open_local_or_gcs(confusion_matrix_file, 'r') as f: -# data = [yaml.load(line) for line in f.read().rstrip().split('\\n')] -# datalab.mlalpha.ConfusionMatrix([d['predicted'] for d in data], -# [d['target'] for d in 
data], -# [d['count'] for d in data]).plot() -""" - IPython.get_ipython().set_next_input(content, replace=True) - - -def _evaluate(args, cell): - if not cell: - _output_evaluate_template(args['cloud']) - return - - env = datalab.utils.commands.notebook_environment() - config = datalab.utils.commands.parse_config(cell, env) - datalab.utils.commands.validate_config(config, - ['preprocessed_eval_data_path', 'metadata_path', 'model_dir', 'output_dir'], - optional_keys=['output_prediction_name', 'output_score_name']) - command = '%%mlalpha evaluate' - if args['cloud']: - command += ' --cloud' - command += '\n' + cell - _output_evaluate_code_template(command, args['cloud'], config['preprocessed_eval_data_path'], - config['metadata_path'], config['model_dir'], config['output_dir'], - output_prediction_name=config.get('output_prediction_name', None), - output_score_name=config.get('output_score_name', None)) - - -def _output_dataset_template(name): - content = """%%mlalpha dataset --name %s -source: - data1: data_local_or_gcs_path - data2: data_local_or_gcs_path -featureset: your-featureset-class-name -""" % name - IPython.get_ipython().set_next_input(content, replace=True) - - -def _dataset(args, cell): - if not cell: - _output_dataset_template(args['name']) - return - env = datalab.utils.commands.notebook_environment() - config = datalab.utils.commands.parse_config(cell, env) - datalab.utils.commands.validate_config(config, ['source', 'featureset'], - optional_keys=['format']) - if config['featureset'] not in env: - raise Exception('"%s" is not defined.' % config['featureset']) - featureset_class = env[config['featureset']] - format = config.get('format', 'csv') - ds = datalab.mlalpha.DataSet(featureset_class(), config['source'], format=format) - env[args['name']] = ds - - -def _module(args, cell): - if not cell: - raise Exception('Expect code in cell.') - return - - env = datalab.utils.commands.notebook_environment() - if '_ml_modules_' not in env: - modules = {} - env['_ml_modules_'] = modules - modules = env['_ml_modules_'] - modules[args['name']] = cell - if args['main']: - env['_ml_modules_main_'] = args['name'] - - -def _package(args, cell): - env = datalab.utils.commands.notebook_environment() - if '_ml_modules_' not in env: - raise Exception('No ml modules defined. Expect modules defined with "%%mlalpha module"') - package_path = datalab.mlalpha.Packager().package(env['_ml_modules_'], args['name']) - google.cloud.ml.util._file.create_directory(args['output']) - dest = os.path.join(args['output'], os.path.basename(package_path)) - google.cloud.ml.util._file.copy_file(package_path, dest) - os.remove(package_path) - print 'Package created at %s.' % dest diff --git a/datalab/mlalpha/commands/_tensorboard.py b/datalab/mlalpha/commands/_tensorboard.py deleted file mode 100644 index c789f97b2..000000000 --- a/datalab/mlalpha/commands/_tensorboard.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2016 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
- -try: - import IPython - import IPython.core.magic -except ImportError: - raise Exception('This module can only be loaded in ipython.') - -import datalab.mlalpha -import datalab.utils.commands - - -@IPython.core.magic.register_line_cell_magic -def tensorboard(line, cell=None): - """Implements the tensorboard cell magic. - - Args: - line: the contents of the tensorboard line. - Returns: - The results of executing the cell. - """ - parser = datalab.utils.commands.CommandParser(prog='tensorboard', description=""" -Execute tensorboard operations. Use "%tensorboard -h" for help on a specific command. -""") - list_parser = parser.subcommand('list', 'List running TensorBoard instances.') - list_parser.set_defaults(func=_list) - start_parser = parser.subcommand('start', 'Start a TensorBoard server with the given logdir.') - start_parser.add_argument('--logdir', - help='The directory containing TensorFlow events. ' + - 'Can be a GCS or local path.', - required=True) - start_parser.set_defaults(func=_start) - stop_parser = parser.subcommand('stop', 'Stop a TensorBoard server with the given pid.') - stop_parser.add_argument('--pid', - help='The pid of the TensorBoard instance to stop.', - required=True) - stop_parser.set_defaults(func=_stop) - namespace = datalab.utils.commands.notebook_environment() - return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace) - - -def _list(args, _): - """ List the running TensorBoard instances. """ - return datalab.utils.commands.render_dictionary( - datalab.mlalpha.TensorBoardManager.get_running_list(), - ['pid', 'logdir', 'port']) - - -def _start(args, _): - """ Start a TensorBoard instance. """ - pid, port = datalab.mlalpha.TensorBoardManager.start(args['logdir']) - url = datalab.mlalpha.TensorBoardManager.get_reverse_proxy_url(port) - html = '
TensorBoard was started successfully with pid %d. ' % pid - html += 'Click here to access it.
' % url - return IPython.core.display.HTML(html) - - -def _stop(args, _): - """ Stop a TensorBoard instance. """ - datalab.mlalpha.TensorBoardManager.stop(int(args['pid'])) - diff --git a/datalab/notebook/static/extern/lantern-browser.html b/datalab/notebook/static/extern/lantern-browser.html new file mode 100644 index 000000000..954b67f1f --- /dev/null +++ b/datalab/notebook/static/extern/lantern-browser.html @@ -0,0 +1,5424 @@ + + + + + + + + diff --git a/datalab/utils/_iterator.py b/datalab/utils/_iterator.py index 0db29485c..3d887bb38 100644 --- a/datalab/utils/_iterator.py +++ b/datalab/utils/_iterator.py @@ -37,7 +37,8 @@ def __iter__(self): self._page_token = next_page_token self._first_page = False - self._count += len(items) + if self._count == 0: + self._count = len(items) for item in items: yield item diff --git a/docs/gen-magic-rst.ipy b/docs/gen-magic-rst.ipy index a44d6cec0..dc980a9c5 100644 --- a/docs/gen-magic-rst.ipy +++ b/docs/gen-magic-rst.ipy @@ -1,9 +1,8 @@ import subprocess, pkgutil, importlib, sys from cStringIO import StringIO -# ignore mlalpha and tensorboard for now because of their tensorflow dependency -# until tensorboard is pip installable and can be listed as a pydatalab dependency -IGNORED_MAGICS = ['mlalpha', 'tensorboard'] + +IGNORED_MAGICS = [] # import submodules submodules = [s for _,s,_ in pkgutil.iter_modules(['../datalab'])] diff --git a/setup.py b/setup.py index 145e61c36..f8404392b 100644 --- a/setup.py +++ b/setup.py @@ -51,8 +51,7 @@ 'datalab.data', 'datalab.data.commands', 'datalab.kernel', - 'datalab.mlalpha', - 'datalab.mlalpha.commands', + 'datalab.ml', 'datalab.notebook', 'datalab.stackdriver', 'datalab.stackdriver.commands', @@ -106,6 +105,7 @@ 'pyyaml==3.11', 'requests==2.9.1', 'ipykernel==4.4.1', + 'psutil==4.3.0', 'jsonschema==2.6.0', ], package_data={ @@ -140,6 +140,7 @@ 'static/extern/d3.parcoords.js', 'static/extern/d3.parcoords.css', 'static/extern/sylvester.js', + 'static/extern/lantern-browser.html', ] } ) diff --git a/datalab/mlalpha/commands/__init__.py b/solutionbox/inception/datalab_solutions/__init__.py similarity index 79% rename from datalab/mlalpha/commands/__init__.py rename to solutionbox/inception/datalab_solutions/__init__.py index 3a5c1044b..3d74130ef 100644 --- a/datalab/mlalpha/commands/__init__.py +++ b/solutionbox/inception/datalab_solutions/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All rights reserved. +# Copyright 2017 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -10,8 +10,3 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. - -from __future__ import absolute_import - -from . import _mlalpha -from . import _tensorboard diff --git a/solutionbox/inception/datalab_solutions/inception/__init__.py b/solutionbox/inception/datalab_solutions/inception/__init__.py new file mode 100644 index 000000000..12407cb30 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + + +from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \ + cloud_predict, local_batch_predict, cloud_batch_predict diff --git a/solutionbox/inception/datalab_solutions/inception/_cloud.py b/solutionbox/inception/datalab_solutions/inception/_cloud.py new file mode 100644 index 000000000..e14f8e1d1 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_cloud.py @@ -0,0 +1,157 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Cloud implementation for preprocessing, training and prediction for inception model. +""" + +import apache_beam as beam +import base64 +import collections +import datetime +import logging +import os + + +from . import _model +from . import _predictor +from . import _preprocess +from . import _trainer +from . import _util + + +class Cloud(object): + """Class for cloud training, preprocessing and prediction.""" + + def __init__(self, checkpoint=None): + self._checkpoint = checkpoint + if self._checkpoint is None: + self._checkpoint = _util._DEFAULT_CHECKPOINT_GSURL + + def _repackage_to_staging(self, output_path): + """Repackage inception from local installed location and copy it to GCS. + """ + + import datalab.ml as ml + + # Find the package root. __file__ is under [package_root]/datalab_solutions/inception. + package_root = os.path.join(os.path.dirname(__file__), '../../') + # We deploy setup.py in the same dir for repackaging purpose. 
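# Illustrative sketch of the repackaging flow below (hypothetical bucket name): with
# output_path='gs://my-bucket/output', the package is built with the bundled setup.py and
# copied to gs://my-bucket/output/staging/inception.tar.gz; that staging URL is then
# handed to Dataflow via 'extra_packages' and to the CloudML trainer via 'package_uris'.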
+ setup_py = os.path.join(os.path.dirname(__file__), 'setup.py') + staging_package_url = os.path.join(output_path, 'staging', 'inception.tar.gz') + ml.package_and_copy(package_root, setup_py, staging_package_url) + return staging_package_url + + def preprocess(self, train_dataset, eval_dataset, output_dir, pipeline_option): + """Cloud preprocessing with Cloud DataFlow.""" + + import datalab.ml as ml + + job_name = 'preprocess-inception-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + staging_package_url = self._repackage_to_staging(output_dir) + options = { + 'staging_location': os.path.join(output_dir, 'tmp', 'staging'), + 'temp_location': os.path.join(output_dir, 'tmp'), + 'job_name': job_name, + 'project': _util.default_project(), + 'extra_packages': [staging_package_url], + 'teardown_policy': 'TEARDOWN_ALWAYS', + 'no_save_main_session': True + } + if pipeline_option is not None: + options.update(pipeline_option) + + opts = beam.pipeline.PipelineOptions(flags=[], **options) + p = beam.Pipeline('DataflowRunner', options=opts) + _preprocess.configure_pipeline(p, train_dataset, eval_dataset, self._checkpoint, + output_dir, job_name) + p.run() + return job_name + + def train(self, input_dir, batch_size, max_steps, output_path, cloud_train_config): + """Cloud training with CloudML trainer service.""" + + import datalab.ml as ml + + staging_package_url = self._repackage_to_staging(output_path) + job_args = { + 'input_dir': input_dir, + 'output_path': output_path, + 'max_steps': max_steps, + 'batch_size': batch_size, + 'checkpoint': self._checkpoint + } + job_request = { + 'package_uris': [staging_package_url], + 'python_module': 'datalab_solutions.inception.task', + 'args': job_args + } + job_request.update(dict(cloud_train_config._asdict())) + job_id = 'inception_train_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S') + job = ml.Job.submit_training(job_request, job_id) + return job + + def predict(self, model_id, images): + """Cloud prediction with CloudML prediction service.""" + + import datalab.ml as ml + parts = model_id.split('.') + if len(parts) != 2: + raise ValueError('Invalid model name for cloud prediction. Use "model.version".') + if len(images) == 0: + raise ValueError('images is empty.') + + data = [] + for ii, image in enumerate(images): + image_encoded = base64.b64encode(image) + data.append({ + 'key': str(ii), + 'image_bytes': {'b64': image_encoded} + }) + + predictions = ml.ModelVersions(parts[0]).predict(parts[1], data) + if len(predictions) == 0: + raise Exception('Prediction results are empty.') + # Although prediction results contains a labels list in each instance, they are all the same + # so taking the first one. 
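# Illustrative shape of a single prediction instance (label names and scores are
# hypothetical):
#   {'prediction': 'daisy', 'labels': ['daisy', 'rose', 'tulip'], 'scores': [0.9, 0.07, 0.03]}
# The comprehension below would then yield ('daisy', 0.9): the predicted label's index is
# looked up in 'labels' and used to read the matching entry from 'scores'.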
+ labels = predictions[0]['labels'] + labels_and_scores = [(x['prediction'], x['scores'][labels.index(x['prediction'])]) + for x in predictions] + return labels_and_scores + + def batch_predict(self, dataset, model_dir, gcs_staging_location, output_csv, + output_bq_table, pipeline_option): + """Cloud batch prediction with a model specified by a GCS directory.""" + + import datalab.ml as ml + + job_name = 'batch-predict-inception-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + staging_package_url = self._repackage_to_staging(gcs_staging_location) + options = { + 'staging_location': os.path.join(gcs_staging_location, 'tmp', 'staging'), + 'temp_location': os.path.join(gcs_staging_location, 'tmp'), + 'job_name': job_name, + 'project': _util.default_project(), + 'extra_packages': [staging_package_url], + 'teardown_policy': 'TEARDOWN_ALWAYS', + 'no_save_main_session': True + } + if pipeline_option is not None: + options.update(pipeline_option) + + opts = beam.pipeline.PipelineOptions(flags=[], **options) + p = beam.Pipeline('DataflowRunner', options=opts) + _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table) + p.run() + return job_name diff --git a/solutionbox/inception/datalab_solutions/inception/_inceptionlib.py b/solutionbox/inception/datalab_solutions/inception/_inceptionlib.py new file mode 100644 index 000000000..1246ecb7d --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_inceptionlib.py @@ -0,0 +1,599 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Inception model building libraries. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def inception_v3_base(inputs, + final_endpoint='Mixed_7c', + min_depth=16, + depth_multiplier=1.0, + scope=None): + """Inception model from http://arxiv.org/abs/1512.00567. + + Constructs an Inception v3 network from inputs to the given final endpoint. + This method can construct the network up to the final inception block + Mixed_7c. + + Note that the names of the layers in the paper do not correspond to the names + of the endpoints registered by this function although they build the same + network. 
+ + Here is a mapping from the old_names to the new names: + Old name | New name + ======================================= + conv0 | Conv2d_1a_3x3 + conv1 | Conv2d_2a_3x3 + conv2 | Conv2d_2b_3x3 + pool1 | MaxPool_3a_3x3 + conv3 | Conv2d_3b_1x1 + conv4 | Conv2d_4a_3x3 + pool2 | MaxPool_5a_3x3 + mixed_35x35x256a | Mixed_5b + mixed_35x35x288a | Mixed_5c + mixed_35x35x288b | Mixed_5d + mixed_17x17x768a | Mixed_6a + mixed_17x17x768b | Mixed_6b + mixed_17x17x768c | Mixed_6c + mixed_17x17x768d | Mixed_6d + mixed_17x17x768e | Mixed_6e + mixed_8x8x1280a | Mixed_7a + mixed_8x8x2048a | Mixed_7b + mixed_8x8x2048b | Mixed_7c + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + final_endpoint: specifies the endpoint to construct the network up to. It + can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', + 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', + 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', + 'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c']. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + scope: Optional variable_scope. + + Returns: + tensor_out: output tensor corresponding to the final_endpoint. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: if final_endpoint is not set to one of the predefined values, + or depth_multiplier <= 0 + """ + # end_points will collect relevant activations for external use, for example + # summaries or losses. + end_points = {} + + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs]): + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='VALID'): + # 299 x 299 x 3 + end_point = 'Conv2d_1a_3x3' + net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 149 x 149 x 32 + end_point = 'Conv2d_2a_3x3' + net = slim.conv2d(net, depth(32), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 32 + end_point = 'Conv2d_2b_3x3' + net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 64 + end_point = 'MaxPool_3a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 64 + end_point = 'Conv2d_3b_1x1' + net = slim.conv2d(net, depth(80), [1, 1], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 80. + end_point = 'Conv2d_4a_3x3' + net = slim.conv2d(net, depth(192), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 71 x 71 x 192. 
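# Illustrative use of the early-return pattern above (assumed call, matching the
# inception_v3_base signature): requesting an intermediate endpoint, e.g.
#   net, end_points = inception_v3_base(inputs, final_endpoint='Conv2d_4a_3x3')
# stops construction at this point and returns the 71 x 71 x 192 activations (for
# 299 x 299 inputs) together with every endpoint built so far.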
+ end_point = 'MaxPool_5a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 35 x 35 x 192. + + # Inception blocks + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + # mixed: 35 x 35 x 256. + end_point = 'Mixed_5b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(32), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_1: 35 x 35 x 288. + end_point = 'Mixed_5c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv_1_0c_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_2: 35 x 35 x 288. + end_point = 'Mixed_5d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_3: 17 x 17 x 768. 
+ end_point = 'Mixed_6a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat([branch_0, branch_1, branch_2], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed4: 17 x 17 x 768. + end_point = 'Mixed_6b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(128), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(128), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_5: 17 x 17 x 768. + end_point = 'Mixed_6c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_6: 17 x 17 x 768. 
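# In Mixed_6b through Mixed_6e (above and below), the paired [1, 7] and [7, 1] convolutions
# factorize a 7x7 convolution into two cheaper asymmetric convolutions, reducing parameters
# while preserving the receptive field (see http://arxiv.org/abs/1512.00567, referenced in
# the docstrings above).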
+ end_point = 'Mixed_6d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_7: 17 x 17 x 768. + end_point = 'Mixed_6e' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_8: 8 x 8 x 1280. + end_point = 'Mixed_7a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat([branch_0, branch_1, branch_2], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_9: 8 x 8 x 2048. 
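# Note on the two 8 x 8 blocks below: the expanded branches end in parallel [1, 3] and
# [3, 1] convolutions whose outputs are concatenated, widening the filter bank on the
# coarsest grid rather than stacking asymmetric convolutions sequentially as in the
# 17 x 17 blocks.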
+ end_point = 'Mixed_7b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = tf.concat([ + slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), + slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')], 3) + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d( + branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') + branch_2 = tf.concat([ + slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), + slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')], 3) + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_10: 8 x 8 x 2048. + end_point = 'Mixed_7c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = tf.concat([ + slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), + slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')], 3) + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d( + branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') + branch_2 = tf.concat([ + slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), + slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')], 3) + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + +def inception_v3(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.8, + min_depth=16, + depth_multiplier=1.0, + prediction_fn=slim.softmax, + spatial_squeeze=True, + reuse=None, + scope='InceptionV3'): + """Inception model from http://arxiv.org/abs/1512.00567. + + "Rethinking the Inception Architecture for Computer Vision" + + Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, + Zbigniew Wojna. + + With the default arguments this method constructs the exact model defined in + the paper. However, one can experiment with variations of the inception_v3 + network by changing arguments dropout_keep_prob, min_depth and + depth_multiplier. + + The default image size used to train this network is 299x299. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether is training or not. + dropout_keep_prob: the percentage of activation values that are retained. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. 
+ depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + prediction_fn: a function to get predictions out of logits. + spatial_squeeze: if True, logits is of shape is [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation. + + Raises: + ValueError: if 'depth_multiplier' is less than or equal to zero. + """ + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs, num_classes], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + net, end_points = inception_v3_base( + inputs, scope=scope, min_depth=min_depth, + depth_multiplier=depth_multiplier) + + # Auxiliary Head logits + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + aux_logits = end_points['Mixed_6e'] + with tf.variable_scope('AuxLogits'): + aux_logits = slim.avg_pool2d( + aux_logits, [5, 5], stride=3, padding='VALID', + scope='AvgPool_1a_5x5') + aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1], + scope='Conv2d_1b_1x1') + + # Shape of feature map before the final layer. + kernel_size = _reduced_kernel_size_for_small_input( + aux_logits, [5, 5]) + aux_logits = slim.conv2d( + aux_logits, depth(768), kernel_size, + weights_initializer=trunc_normal(0.01), + padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size)) + aux_logits = slim.conv2d( + aux_logits, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, weights_initializer=trunc_normal(0.001), + scope='Conv2d_2b_1x1') + if spatial_squeeze: + aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze') + end_points['AuxLogits'] = aux_logits + + # Final pooling and prediction + with tf.variable_scope('Logits'): + kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8]) + net = slim.avg_pool2d(net, kernel_size, padding='VALID', + scope='AvgPool_1a_{}x{}'.format(*kernel_size)) + # 1 x 1 x 2048 + net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b') + end_points['PreLogits'] = net + # 2048 + logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='Conv2d_1c_1x1') + if spatial_squeeze: + logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') + # 1000 + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + return logits, end_points +inception_v3.default_image_size = 299 + + +def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): + """Define kernel size which is automatically reduced for small input. + + If the shape of the input images is unknown at graph construction time this + function assumes that the input images are is large enough. + + Args: + input_tensor: input tensor of size [batch_size, height, width, channels]. 
+ kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] + + Returns: + a tensor with the kernel size. + + TODO(jrru): Make this function work with unknown shapes. Theoretically, this + can be done with the code below. Problems are two-fold: (1) If the shape was + known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot + handle tensors that define the kernel size. + shape = tf.shape(input_tensor) + return = tf.stack([tf.minimum(shape[1], kernel_size[0]), + tf.minimum(shape[2], kernel_size[1])]) + + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size_out = kernel_size + else: + kernel_size_out = [min(shape[1], kernel_size[0]), + min(shape[2], kernel_size[1])] + return kernel_size_out + + +def inception_v3_arg_scope(weight_decay=0.00004, + stddev=0.1, + batch_norm_var_collection='moving_vars'): + """Defines the default InceptionV3 arg scope. + + Args: + weight_decay: The weight decay to use for regularizing the model. + stddev: The standard deviation of the trunctated normal weight initializer. + batch_norm_var_collection: The name of the collection for the batch norm + variables. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': 0.9997, + # epsilon to prevent 0s in variance. + 'epsilon': 0.001, + # collection containing update_ops. + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + # collection containing the moving mean and moving variance. + 'variables_collections': { + 'beta': None, + 'gamma': None, + 'moving_mean': [batch_norm_var_collection], + 'moving_variance': [batch_norm_var_collection], + } + } + + # Set weight_decay for weights in Conv and FC layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay)): + with slim.arg_scope( + [slim.conv2d], + weights_initializer=tf.truncated_normal_initializer(stddev=stddev), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params) as sc: + return sc diff --git a/solutionbox/inception/datalab_solutions/inception/_local.py b/solutionbox/inception/datalab_solutions/inception/_local.py new file mode 100644 index 000000000..63d50a332 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_local.py @@ -0,0 +1,86 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Local implementation for preprocessing, training and prediction for inception model. +""" + +import apache_beam as beam +import collections +import csv +import datetime +import json +import os +import tensorflow as tf +import yaml + + +from . import _model +from . import _predictor +from . import _preprocess +from . import _trainer +from . 
import _util + + +class Local(object): + """Class for local training, preprocessing and prediction.""" + + def __init__(self, checkpoint=None): + self._checkpoint = checkpoint + if self._checkpoint is None: + self._checkpoint = _util._DEFAULT_CHECKPOINT_GSURL + + def preprocess(self, train_dataset, eval_dataset, output_dir): + """Local preprocessing with local DataFlow.""" + + import datalab.ml as ml + job_id = 'inception_preprocessed_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S') + # Project is needed for bigquery data source, even in local run. + options = { + 'project': _util.default_project(), + } + opts = beam.pipeline.PipelineOptions(flags=[], **options) + p = beam.Pipeline('DirectRunner', options=opts) + _preprocess.configure_pipeline(p, train_dataset, eval_dataset, + self._checkpoint, output_dir, job_id) + p.run().wait_until_finish() + + def train(self, input_dir, batch_size, max_steps, output_dir): + """Local training.""" + + labels = _util.get_labels(input_dir) + model = _model.Model(labels, 0.5, self._checkpoint) + task_data = {'type': 'master', 'index': 0} + task = type('TaskSpec', (object,), task_data) + _trainer.Trainer(input_dir, batch_size, max_steps, output_dir, + model, None, task).run_training() + + def predict(self, model_dir, images): + """Local prediction.""" + + return _predictor.predict(model_dir, images) + + + def batch_predict(self, dataset, model_dir, output_csv, output_bq_table): + """Local batch prediction.""" + import datalab.ml as ml + job_id = 'inception_batch_predict_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S') + # Project is needed for bigquery data source, even in local run. + options = { + 'project': _util.default_project(), + } + opts = beam.pipeline.PipelineOptions(flags=[], **options) + p = beam.Pipeline('DirectRunner', options=opts) + _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table) + p.run().wait_until_finish() diff --git a/solutionbox/inception/datalab_solutions/inception/_model.py b/solutionbox/inception/datalab_solutions/inception/_model.py new file mode 100644 index 000000000..ad328bdfd --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_model.py @@ -0,0 +1,390 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Inception model tensorflow implementation. +""" + +from enum import Enum +import json +import logging +import os +import tensorflow as tf +from tensorflow.contrib import layers + +from . import _inceptionlib +from . 
import _util + + +slim = tf.contrib.slim + +LOGITS_TENSOR_NAME = 'logits_tensor' +IMAGE_URI_COLUMN = 'image_uri' +LABEL_COLUMN = 'label' +EMBEDDING_COLUMN = 'embedding' + +BOTTLENECK_TENSOR_SIZE = 2048 + + +class GraphMod(Enum): + TRAIN = 1 + EVALUATE = 2 + PREDICT = 3 + + +class GraphReferences(object): + """Holder of base tensors used for training model using common task.""" + + def __init__(self): + self.examples = None + self.train = None + self.global_step = None + self.metric_updates = [] + self.metric_values = [] + self.keys = None + self.predictions = [] + + +class Model(object): + """TensorFlow model for the flowers problem.""" + + def __init__(self, labels, dropout, inception_checkpoint_file): + self.labels = labels + self.labels.sort() + self.dropout = dropout + self.inception_checkpoint_file = inception_checkpoint_file + + def add_final_training_ops(self, + embeddings, + all_labels_count, + bottleneck_tensor_size, + hidden_layer_size=BOTTLENECK_TENSOR_SIZE / 4, + dropout_keep_prob=None): + """Adds a new softmax and fully-connected layer for training. + + The set up for the softmax and fully-connected layers is based on: + https://tensorflow.org/versions/master/tutorials/mnist/beginners/index.html + + This function can be customized to add arbitrary layers for + application-specific requirements. + Args: + embeddings: The embedding (bottleneck) tensor. + all_labels_count: The number of all labels including the default label. + bottleneck_tensor_size: The number of embeddings. + hidden_layer_size: The size of the hidden_layer. Roughtly, 1/4 of the + bottleneck tensor size. + dropout_keep_prob: the percentage of activation values that are retained. + Returns: + softmax: The softmax or tensor. It stores the final scores. + logits: The logits tensor. + """ + with tf.name_scope('input'): + bottleneck_input = tf.placeholder_with_default( + embeddings, + shape=[None, bottleneck_tensor_size], + name='ReshapeSqueezed') + bottleneck_with_no_gradient = tf.stop_gradient(bottleneck_input) + + with tf.name_scope('Wx_plus_b'): + hidden = layers.fully_connected(bottleneck_with_no_gradient, + hidden_layer_size) + # We need a dropout when the size of the dataset is rather small. + if dropout_keep_prob: + hidden = tf.nn.dropout(hidden, dropout_keep_prob) + logits = layers.fully_connected( + hidden, all_labels_count, activation_fn=None) + + softmax = tf.nn.softmax(logits, name='softmax') + return softmax, logits + + def build_inception_graph(self): + """Builds an inception graph and add the necessary input & output tensors. + + To use other Inception models modify this file. Also preprocessing must be + modified accordingly. + + See tensorflow/contrib/slim/python/slim/nets/inception_v3.py for + details about InceptionV3. + + Returns: + input_jpeg: A placeholder for jpeg string batch that allows feeding the + Inception layer with image bytes for prediction. + inception_embeddings: The embeddings tensor. + """ + image_str_tensor = tf.placeholder(tf.string, shape=[None]) + + # The CloudML Prediction API always "feeds" the Tensorflow graph with + # dynamic batch sizes e.g. (?,). decode_jpeg only processes scalar + # strings because it cannot guarantee a batch of images would have + # the same output size. We use tf.map_fn to give decode_jpeg a scalar + # string from dynamic batches. + image = tf.map_fn( + _util.decode_and_resize, image_str_tensor, back_prop=False, dtype=tf.uint8) + # convert_image_dtype, also scales [0, uint8_max] -> [0 ,1). 
+ image = tf.image.convert_image_dtype(image, dtype=tf.float32) + + # Then shift images to [-1, 1) for Inception. + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + + # Build Inception layers, which expect A tensor of type float from [-1, 1) + # and shape [batch_size, height, width, channels]. + with slim.arg_scope(_inceptionlib.inception_v3_arg_scope()): + _, end_points = _inceptionlib.inception_v3(image, is_training=False) + + inception_embeddings = end_points['PreLogits'] + inception_embeddings = tf.squeeze( + inception_embeddings, [1, 2], name='SpatialSqueeze') + return image_str_tensor, inception_embeddings + + def build_graph(self, data_paths, batch_size, graph_mod): + """Builds generic graph for training or eval.""" + tensors = GraphReferences() + is_training = graph_mod == GraphMod.TRAIN + if data_paths: + _, tensors.examples = _util.read_examples( + data_paths, + batch_size, + shuffle=is_training, + num_epochs=None if is_training else 2) + else: + tensors.examples = tf.placeholder(tf.string, name='input', shape=(None,)) + + if graph_mod == GraphMod.PREDICT: + inception_input, inception_embeddings = self.build_inception_graph() + # Build the Inception graph. We later add final training layers + # to this graph. This is currently used only for prediction. + # For training, we use pre-processed data, so it is not needed. + embeddings = inception_embeddings + tensors.input_jpeg = inception_input + else: + # For training and evaluation we assume data is preprocessed, so the + # inputs are tf-examples. + # Generate placeholders for examples. + with tf.name_scope('inputs'): + feature_map = { + 'image_uri': + tf.FixedLenFeature( + shape=[], dtype=tf.string, default_value=['']), + # Some images may have no labels. For those, we assume a default + # label. So the number of labels is label_count+1 for the default + # label. + 'label': + tf.FixedLenFeature( + shape=[1], dtype=tf.int64, + default_value=[len(self.labels)]), + 'embedding': + tf.FixedLenFeature( + shape=[BOTTLENECK_TENSOR_SIZE], dtype=tf.float32) + } + parsed = tf.parse_example(tensors.examples, features=feature_map) + labels = tf.squeeze(parsed['label']) + uris = tf.squeeze(parsed['image_uri']) + embeddings = parsed['embedding'] + + # We assume a default label, so the total number of labels is equal to + # label_count+1. + all_labels_count = len(self.labels) + 1 + with tf.name_scope('final_ops'): + softmax, logits = self.add_final_training_ops( + embeddings, + all_labels_count, + BOTTLENECK_TENSOR_SIZE, + dropout_keep_prob=self.dropout if is_training else None) + + # Prediction is the index of the label with the highest score. We are + # interested only in the top score. + prediction = tf.argmax(softmax, 1) + tensors.predictions = [prediction, softmax, embeddings] + + if graph_mod == GraphMod.PREDICT: + return tensors + + with tf.name_scope('evaluate'): + loss_value = loss(logits, labels) + + # Add to the Graph the Ops that calculate and apply gradients. + if is_training: + tensors.train, tensors.global_step = training(loss_value) + else: + tensors.global_step = tf.Variable(0, name='global_step', trainable=False) + tensors.uris = uris + + # Add means across all batches. 
+ loss_updates, loss_op = _util.loss(loss_value) + accuracy_updates, accuracy_op = _util.accuracy(logits, labels) + + if not is_training: + tf.summary.scalar('accuracy', accuracy_op) + tf.summary.scalar('loss', loss_op) + + tensors.metric_updates = loss_updates + accuracy_updates + tensors.metric_values = [loss_op, accuracy_op] + return tensors + + + def build_train_graph(self, data_paths, batch_size): + return self.build_graph(data_paths, batch_size, GraphMod.TRAIN) + + def build_eval_graph(self, data_paths, batch_size): + return self.build_graph(data_paths, batch_size, GraphMod.EVALUATE) + + def restore_from_checkpoint(self, session, inception_checkpoint_file, + trained_checkpoint_file): + """To restore model variables from the checkpoint file. + + The graph is assumed to consist of an inception model and other + layers including a softmax and a fully connected layer. The former is + pre-trained and the latter is trained using the pre-processed data. So + we restore this from two checkpoint files. + Args: + session: The session to be used for restoring from checkpoint. + inception_checkpoint_file: Path to the checkpoint file for the Inception + graph. + trained_checkpoint_file: path to the trained checkpoint for the other + layers. + """ + inception_exclude_scopes = [ + 'InceptionV3/AuxLogits', 'InceptionV3/Logits', 'global_step', + 'final_ops' + ] + reader = tf.train.NewCheckpointReader(inception_checkpoint_file) + var_to_shape_map = reader.get_variable_to_shape_map() + + # Get all variables to restore. Exclude Logits and AuxLogits because they + # depend on the input data and we do not need to intialize them. + all_vars = tf.contrib.slim.get_variables_to_restore( + exclude=inception_exclude_scopes) + # Remove variables that do not exist in the inception checkpoint (for + # example the final softmax and fully-connected layers). + inception_vars = { + var.op.name: var + for var in all_vars if var.op.name in var_to_shape_map + } + inception_saver = tf.train.Saver(inception_vars) + inception_saver.restore(session, inception_checkpoint_file) + + # Restore the rest of the variables from the trained checkpoint. + trained_vars = tf.contrib.slim.get_variables_to_restore( + exclude=inception_exclude_scopes + inception_vars.keys()) + trained_saver = tf.train.Saver(trained_vars) + trained_saver.restore(session, trained_checkpoint_file) + + def build_prediction_graph(self): + """Builds prediction graph and registers appropriate endpoints.""" + + tensors = self.build_graph(None, 1, GraphMod.PREDICT) + + keys_placeholder = tf.placeholder(tf.string, shape=[None]) + inputs = { + 'key': keys_placeholder.name, + 'image_bytes': tensors.input_jpeg.name + } + + tf.add_to_collection('inputs', json.dumps(inputs)) + + # To extract the id, we need to add the identity function. + keys = tf.identity(keys_placeholder) + labels = self.labels + ['UNKNOWN'] + labels_tensor = tf.constant(labels) + labels_table = tf.contrib.lookup.index_to_string_table_from_tensor(mapping=labels_tensor) + predicted_label = labels_table.lookup(tensors.predictions[0]) + + # Need to duplicate the labels by num_of_instances so the output is one batch + # (all output members share the same outer dimension). + # The labels are needed for client to match class scores list. 
+ labels_tensor = tf.expand_dims(tf.constant(labels), 0) + num_instance = tf.shape(keys) + labels_tensors_n = tf.tile(labels_tensor, tf.concat(axis=0, values=[num_instance, [1]])) + + outputs = { + 'key': keys.name, + 'prediction': predicted_label.name, + 'labels': labels_tensors_n.name, + 'scores': tensors.predictions[1].name, + } + tf.add_to_collection('outputs', json.dumps(outputs)) + # Add table init op to collection so online prediction will load the model and run it. + # TODO: initialize_all_tables is going to be deprecated but the replacement + # tf.tables_initializer does not exist in 0.12 yet. + init_tables_op = tf.tables_initializer() + tf.add_to_collection(tf.contrib.session_bundle.constants.INIT_OP_KEY, init_tables_op) + + def export(self, last_checkpoint, output_dir): + """Builds a prediction graph and xports the model. + + Args: + last_checkpoint: Path to the latest checkpoint file from training. + output_dir: Path to the folder to be used to output the model. + """ + logging.info('Exporting prediction graph to %s', output_dir) + with tf.Session(graph=tf.Graph()) as sess: + # Build and save prediction meta graph and trained variable values. + self.build_prediction_graph() + init_op = tf.global_variables_initializer() + sess.run(init_op) + self.restore_from_checkpoint(sess, self.inception_checkpoint_file, + last_checkpoint) + saver = tf.train.Saver() + saver.export_meta_graph(filename=os.path.join(output_dir, 'export.meta')) + saver.save(sess, os.path.join(output_dir, 'export'), write_meta_graph=False) + + def format_metric_values(self, metric_values): + """Formats metric values - used for logging purpose.""" + + # Early in training, metric_values may actually be None. + loss_str = 'N/A' + accuracy_str = 'N/A' + try: + loss_str = 'loss: %.3f' % metric_values[0] + accuracy_str = 'accuracy: %.3f' % metric_values[1] + except (TypeError, IndexError): + pass + + return '%s, %s' % (loss_str, accuracy_str) + + def format_prediction_values(self, prediction): + """Formats prediction values - used for writing batch predictions as csv.""" + return '%.3f' % (prediction[0]) + + +def loss(logits, labels): + """Calculates the loss from the logits and the labels. + + Args: + logits: Logits tensor, float - [batch_size, NUM_CLASSES]. + labels: Labels tensor, int32 - [batch_size]. + Returns: + loss: Loss tensor of type float. + """ + labels = tf.to_int64(labels) + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=labels, name='xentropy') + return tf.reduce_mean(cross_entropy, name='xentropy_mean') + + +def training(loss_op): + """Calculates the loss from the logits and the labels. + + Args: + logits: Logits tensor, float - [batch_size, NUM_CLASSES]. + labels: Labels tensor, int32 - [batch_size]. + Returns: + loss: Loss tensor of type float. + """ + global_step = tf.Variable(0, name='global_step', trainable=False) + with tf.name_scope('train'): + optimizer = tf.train.AdamOptimizer(epsilon=0.001) + train_op = optimizer.minimize(loss_op, global_step) + return train_op, global_step diff --git a/solutionbox/inception/datalab_solutions/inception/_package.py b/solutionbox/inception/datalab_solutions/inception/_package.py new file mode 100644 index 000000000..cf2862cae --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_package.py @@ -0,0 +1,243 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Provides interface for Datalab. It provides: + local_preprocess + local_train + local_predict + cloud_preprocess + cloud_train + cloud_predict + Datalab will look for functions with the above names. +""" + +import logging +import os +import urllib + +from . import _cloud +from . import _local +from . import _model +from . import _preprocess +from . import _trainer +from . import _util + + +def local_preprocess(train_dataset, output_dir, checkpoint=None, eval_dataset=None): + """Preprocess data locally. Produce output that can be used by training efficiently. + Args: + train_dataset: training data source to preprocess. Can be CsvDataset or BigQueryDataSet. + If eval_dataset is None, the pipeline will randomly split train_dataset into + train/eval set with 7:3 ratio. + output_dir: The output directory to use. Preprocessing will create a sub directory under + it for each run, and also update "latest" file which points to the latest preprocessed + directory. Users are responsible for cleanup. Can be local or GCS path. + checkpoint: the Inception checkpoint to use. + eval_dataset: evaluation data source to preprocess. Can be CsvDataset or BigQueryDataSet. + If specified, it will be used for evaluation during training, and train_dataset will be + completely used for training. + """ + + print 'Local preprocessing...' + # TODO: Move this to a new process to avoid pickling issues + # TODO: Expose train/eval split ratio + _local.Local(checkpoint).preprocess(train_dataset, eval_dataset, output_dir) + print 'Done' + + +def cloud_preprocess(train_dataset, output_dir, checkpoint=None, pipeline_option=None, + eval_dataset=None): + """Preprocess data in Cloud with DataFlow. + Produce output that can be used by training efficiently. + Args: + train_dataset: training data source to preprocess. Can be CsvDataset or BigQueryDataSet. + For CsvDataSet, all files must be in GCS. + If eval_dataset is None, the pipeline will randomly split train_dataset into + train/eval set with 7:3 ratio. + output_dir: The output directory to use. Preprocessing will create a sub directory under + it for each run, and also update "latest" file which points to the latest preprocessed + directory. Users are responsible for cleanup. GCS path only. + checkpoint: the Inception checkpoint to use. + pipeline_option: DataFlow pipeline options in a dictionary. + eval_dataset: evaluation data source to preprocess. Can be CsvDataset or BigQueryDataSet. + If specified, it will be used for evaluation during training, and train_dataset will be + completely used for training. + """ + + job_name = _cloud.Cloud(checkpoint=checkpoint).preprocess(train_dataset, eval_dataset, + output_dir, pipeline_option) + if (_util.is_in_IPython()): + import IPython + + dataflow_url = 'https://console.developers.google.com/dataflow?project=%s' % \ + _util.default_project() + html = 'Job "%s" submitted.' % job_name + html += '
<br/><a href="%s" target="_blank">Click here to track preprocessing job.</a>
' \ + % dataflow_url + IPython.display.display_html(html, raw=True) + + +def local_train(input_dir, batch_size, max_steps, output_dir, checkpoint=None): + """Train model locally. The output can be used for local prediction or for online deployment. + Args: + input_dir: A directory path containing preprocessed results. Can be local or GCS path. + batch_size: size of batch used for training. + max_steps: number of steps to train. + output_dir: The output directory to use. Can be local or GCS path. + checkpoint: the Inception checkpoint to use. + """ + + logger = logging.getLogger() + original_level = logger.getEffectiveLevel() + logger.setLevel(logging.INFO) + print 'Local training...' + try: + _local.Local(checkpoint).train(input_dir, batch_size, max_steps, output_dir) + finally: + logger.setLevel(original_level) + print 'Done' + + +def cloud_train(input_dir, batch_size, max_steps, output_dir, + cloud_train_config, checkpoint=None): + """Train model in the cloud with CloudML trainer service. + The output can be used for local prediction or for online deployment. + Args: + input_dir: A directory path containing preprocessed results. GCS path only. + batch_size: size of batch used for training. + max_steps: number of steps to train. + output_dir: The output directory to use. GCS path only. + cloud_train_config: a datalab.ml.CloudTrainingConfig object. + checkpoint: the Inception checkpoint to use. + """ + + job = _cloud.Cloud(checkpoint=checkpoint).train(input_dir, batch_size, + max_steps, output_dir, cloud_train_config) + if (_util.is_in_IPython()): + import IPython + log_url_query_strings = { + 'project': _util.default_project(), + 'resource': 'ml.googleapis.com/job_id/' + job.info['jobId'] + } + log_url = 'https://console.developers.google.com/logs/viewer?' + \ + urllib.urlencode(log_url_query_strings) + html = 'Job "%s" submitted.' % job.info['jobId'] + html += '
<br/><a href="%s" target="_blank">Click here to view cloud log.</a>
' % log_url + IPython.display.display_html(html, raw=True) + + +def _display_predict_results(results, show_image): + if (_util.is_in_IPython()): + import IPython + for image_url, image, label_and_score in results: + if show_image is True: + IPython.display.display_html('
<p>%s(%.5f)</p>
' % label_and_score, + raw=True) + IPython.display.display(IPython.display.Image(data=image)) + else: + IPython.display.display_html( + '
<p>%s&nbsp;&nbsp;&nbsp;&nbsp;%s(%.5f)</p>
' % ((image_url,) + label_and_score), raw=True) + else: + print results + + +def local_predict(model_dir, image_files, resize=False, show_image=True): + """Predict using an offline model. + Args: + model_dir: The directory of a trained inception model. Can be local or GCS paths. + image_files: The paths to the image files to predict labels. Can be local or GCS paths. + show_image: Whether to show images in the results. + resize: Whether to resize the image to a reasonable size (300x300) before prediction. + """ + print('Predicting...') + images = _util.load_images(image_files, resize=resize) + labels_and_scores = _local.Local().predict(model_dir, images) + results = zip(image_files, images, labels_and_scores) + _display_predict_results(results, show_image) + print('Done') + + +def cloud_predict(model_id, image_files, resize=False, show_image=True): + """Predict using a deployed (online) model. + Args: + model_id: The deployed model id in the form of "model.version". + image_files: The paths to the image files to predict labels. GCS paths only. + show_image: Whether to show images in the results. + resize: Whether to resize the image to a reasonable size (300x300) before prediction. + Set it to True if your images are too large to send over network. + """ + print('Predicting...') + images = _util.load_images(image_files, resize=resize) + labels_and_scores = _cloud.Cloud().predict(model_id, images) + results = zip(image_files, images, labels_and_scores) + _display_predict_results(results, show_image) + print('Done') + + +def local_batch_predict(dataset, model_dir, output_csv=None, output_bq_table=None): + """Batch predict running locally. + Args: + dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either + one column 'image_url', or two columns with another being 'label'. + model_dir: The directory of a trained inception model. Can be local or GCS paths. + output_csv: The output csv file for prediction results. If specified, + it will also output a csv schema file with the name output_csv + '.schema.json'. + output_bq_table: if specified, the output BigQuery table for prediction results. + output_csv and output_bq_table can both be set. + Raises: + ValueError if both output_csv and output_bq_table are None. + """ + + if output_csv is None and output_bq_table is None: + raise ValueError('output_csv and output_bq_table cannot both be None.') + + print('Predicting...') + _local.Local().batch_predict(dataset, model_dir, output_csv, output_bq_table) + print('Done') + + +def cloud_batch_predict(dataset, model_dir, gcs_staging_location, + output_csv=None, output_bq_table=None, pipeline_option=None): + """Batch predict running in cloud. + + Args: + dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either + one column 'image_url', or two columns with another being 'label'. + model_dir: A GCS path to a trained inception model directory. + gcs_staging_location: A temporary location for DataFlow staging. + output_csv: If specified, prediction results will be saved to the specified Csv file. + It will also output a csv schema file with the name output_csv + '.schema.json'. + GCS file path only. + output_bq_table: If specified, prediction results will be saved to the specified BigQuery + table. output_csv and output_bq_table can both be set, but cannot be both None. + pipeline_option: DataFlow pipeline options in a dictionary. + Raises: + ValueError if both output_csv and output_bq_table are None. 
+ """ + + if output_csv is None and output_bq_table is None: + raise ValueError('output_csv and output_bq_table cannot both be None.') + + job_name = _cloud.Cloud().batch_predict(dataset, model_dir, + gcs_staging_location, output_csv, output_bq_table, pipeline_option) + if (_util.is_in_IPython()): + import IPython + + dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' % + _util.default_project()) + html = 'Job "%s" submitted.' % job_name + html += ('
<br/><a href="%s" target="_blank">Click here to track batch prediction job.</a>
' + % dataflow_url) + IPython.display.display_html(html, raw=True) diff --git a/solutionbox/inception/datalab_solutions/inception/_predictor.py b/solutionbox/inception/datalab_solutions/inception/_predictor.py new file mode 100644 index 000000000..03f3974f6 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_predictor.py @@ -0,0 +1,226 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Local implementation for preprocessing, training and prediction for inception model. +""" + +import apache_beam as beam +import collections +import json +import os +import tensorflow as tf + +from . import _model +from . import _util + + +def _tf_predict(model_dir, images): + model_dir = os.path.join(model_dir, 'model') + with tf.Session() as sess: + new_saver = tf.train.import_meta_graph(os.path.join(model_dir, 'export.meta')) + new_saver.restore(sess, os.path.join(model_dir, 'export')) + init_op = tf.get_collection(tf.contrib.session_bundle.constants.INIT_OP_KEY)[0] + sess.run(init_op) + inputs = json.loads(tf.get_collection('inputs')[0]) + outputs = json.loads(tf.get_collection('outputs')[0]) + feed_dict = collections.defaultdict(list) + for ii, image in enumerate(images): + feed_dict[inputs['image_bytes']].append(image) + feed_dict[inputs['key']].append(str(ii)) + predictions, labels, scores = sess.run( + [outputs['prediction'], outputs['labels'], outputs['scores']], feed_dict=feed_dict) + return zip(predictions, labels, scores) + + +def predict(model_dir, images): + """Local instant prediction.""" + + results = _tf_predict(model_dir, images) + predicted_and_scores = [(predicted, label_scores[list(labels).index(predicted)]) + for predicted, labels, label_scores in results] + return predicted_and_scores + + +# Helpers for batch prediction dataflow pipeline + +class EmitAsBatchDoFn(beam.DoFn): + """A DoFn that buffers the records and emits them batch by batch.""" + + def __init__(self, batch_size): + self._batch_size = batch_size + self._cached = [] + + def process(self, element): + self._cached.append(element) + if len(self._cached) >= self._batch_size: + emit = self._cached + self._cached = [] + yield emit + + def finish_bundle(self, context=None): + if len(self._cached) > 0: # pylint: disable=g-explicit-length-test + yield self._cached + + +class UnbatchDoFn(beam.DoFn): + """A DoFn expand batch into elements.""" + + def process(self, element): + for item in element: + yield item + + +class LoadImagesDoFn(beam.DoFn): + """A DoFn that reads image from url.""" + + def process(self, element): + with _util.open_local_or_gcs(element['image_url'], 'r') as ff: + image_bytes = ff.read() + out_element = {'image_bytes': image_bytes} + out_element.update(element) + yield out_element + + +class PredictBatchDoFn(beam.DoFn): + """A DoFn that does batch prediction.""" + + def __init__(self, model_dir): + import os + + self._model_dir = os.path.join(model_dir, 'model') + self._session = None + self._tf_inputs = None + self._tf_outputs = None + + 
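# A minimal sketch of how the two batching helpers above compose, assuming the
# same Beam SDK used elsewhere in this change and that EmitAsBatchDoFn and
# UnbatchDoFn are importable from this module; the demo pipeline and its step
# names are illustrative only, not part of the module.
import apache_beam as beam

demo = beam.Pipeline('DirectRunner')
_ = (demo
     | 'Create Demo Input' >> beam.Create(range(7))
     | 'Batch Demo' >> beam.ParDo(EmitAsBatchDoFn(3))  # emits [0, 1, 2], [3, 4, 5], then [6]
     | 'Unbatch Demo' >> beam.ParDo(UnbatchDoFn()))    # flattens back to 0, 1, ..., 6
demo.run().wait_until_finish()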
def start_bundle(self, context=None): + import json + import os + import tensorflow as tf + + self._session = tf.Session() + new_saver = tf.train.import_meta_graph(os.path.join(self._model_dir, 'export.meta')) + new_saver.restore(self._session, os.path.join(self._model_dir, 'export')) + init_op = tf.get_collection(tf.contrib.session_bundle.constants.INIT_OP_KEY)[0] + self._session.run(init_op) + self._tf_inputs = json.loads(tf.get_collection('inputs')[0]) + self._tf_outputs = json.loads(tf.get_collection('outputs')[0]) + + def finish_bundle(self, context=None): + if self._session is not None: + self._session.close() + + def process(self, element): + import collections + + image_urls = [x['image_url'] for x in element] + targets = None + if 'label' in element[0] and element[0]['label'] is not None: + targets = [x['label'] for x in element] + + feed_dict = collections.defaultdict(list) + feed_dict[self._tf_inputs['image_bytes']] = [x['image_bytes'] for x in element] + feed_dict[self._tf_inputs['key']] = image_urls + predictions, labels, scores = self._session.run( + [self._tf_outputs['prediction'], self._tf_outputs['labels'], self._tf_outputs['scores']], + feed_dict=feed_dict) + if targets is not None: + yield zip(image_urls, targets, predictions, labels, scores) + else: + yield zip(image_urls, predictions, labels, scores) + + +class ProcessResultsDoFn(beam.DoFn): + """A DoFn that process prediction results by casting values and calculating + target_prob. + """ + + def process(self, element): + target = None + if len(element) == 5: + image_url, target, prediction, labels, scores = element + else: + image_url, prediction, labels, scores = element + labels = list(labels) + predicted_prob = scores[labels.index(prediction)] + out_element = { + 'image_url': image_url, + 'predicted': prediction, + # Convert to float from np.float32 because BigQuery Sink can only handle intrinsic types. 
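        # (A small illustration of why the cast is needed, assuming numpy is
        #  imported as np: np.float32 is not JSON-serializable, so
        #  json.dumps({'p': np.float32(0.5)}) raises TypeError, while
        #  json.dumps({'p': float(np.float32(0.5))}) returns '{"p": 0.5}';
        #  the BigQuery sink JSON-encodes each row, hence the plain float below.)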
+ 'predicted_prob': float(predicted_prob) + } + if target is not None: + target_prob = scores[labels.index(target)] if target in labels else 0.0 + out_element['target_prob'] = float(target_prob) + out_element['target'] = target + yield out_element + + +class MakeCsvLineDoFn(beam.DoFn): + """A DoFn that makes CSV lines out of prediction results.""" + + def process(self, element): + import csv + import StringIO + + line = StringIO.StringIO() + if len(element) == 5: + csv.DictWriter(line, + ['image_url', 'target', 'predicted', 'target_prob', 'predicted_prob']).writerow(element) + else: + csv.DictWriter(line, ['image_url', 'predicted', 'predicted_prob']).writerow(element) + yield line.getvalue() + + +def configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table): + """Configures a dataflow pipeline for batch prediction.""" + + data = _util.get_sources_from_dataset(p, dataset, 'predict') + if len(dataset.schema) == 2: + output_schema = [ + {'name': 'image_url', 'type': 'STRING'}, + {'name': 'target', 'type': 'STRING'}, + {'name': 'predicted', 'type': 'STRING'}, + {'name': 'target_prob', 'type': 'FLOAT'}, + {'name': 'predicted_prob', 'type': 'FLOAT'}, + ] + else: + output_schema = [ + {'name': 'image_url', 'type': 'STRING'}, + {'name': 'predicted', 'type': 'STRING'}, + {'name': 'predicted_prob', 'type': 'FLOAT'}, + ] + results = (data + | 'Load Images' >> beam.ParDo(LoadImagesDoFn()) + | 'Batch Inputs' >> beam.ParDo(EmitAsBatchDoFn(20)) + | 'Batch Predict' >> beam.ParDo(PredictBatchDoFn(model_dir)) + | 'Unbatch' >> beam.ParDo(UnbatchDoFn()) + | 'Process Results' >> beam.ParDo(ProcessResultsDoFn())) + + if output_csv is not None: + schema_file = output_csv + '.schema.json' + results_save = (results + | 'Prepare For Output' >> beam.ParDo(MakeCsvLineDoFn()) + | 'Write Csv Results' >> beam.io.textio.WriteToText(output_csv, shard_name_template='')) + (results_save + | beam.transforms.combiners.Sample.FixedSizeGlobally('Sample One', 1) + | 'Serialize Schema' >> beam.Map(lambda path: json.dumps(output_schema)) + | 'Write Schema' >> beam.io.textio.WriteToText(schema_file, shard_name_template='')) + if output_bq_table is not None: + # BigQuery sink takes schema in the form of 'field1:type1,field2:type2...' + bq_schema_string = ','.join(x['name'] + ':' + x['type'] for x in output_schema) + sink = beam.io.BigQuerySink(output_bq_table, schema=bq_schema_string, + write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE) + results | 'Write BQ Results' >> beam.io.Write(sink) + diff --git a/solutionbox/inception/datalab_solutions/inception/_preprocess.py b/solutionbox/inception/datalab_solutions/inception/_preprocess.py new file mode 100644 index 000000000..a79fed671 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_preprocess.py @@ -0,0 +1,363 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Preprocess pipeline implementation with Cloud DataFlow. 
+""" + + +import apache_beam as beam +from apache_beam.io import fileio +from apache_beam.io import tfrecordio +from apache_beam.metrics import Metrics +from apache_beam.utils.pipeline_options import PipelineOptions +import cStringIO +import csv +import logging +import os +from PIL import Image +import tensorflow as tf + +from . import _inceptionlib +from . import _util + + +slim = tf.contrib.slim + +error_count = Metrics.counter('main', 'errorCount') +rows_count = Metrics.counter('main', 'rowsCount') +skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine') +embedding_good = Metrics.counter('main', 'embedding_good') +embedding_bad = Metrics.counter('main', 'embedding_bad') +incompatible_image = Metrics.counter('main', 'incompatible_image') +invalid_uri = Metrics.counter('main', 'invalid_file_name') +unlabeled_image = Metrics.counter('main', 'unlabeled_image') + + +class ExtractLabelIdsDoFn(beam.DoFn): + """Extracts (uri, label_ids) tuples from CSV rows. + """ + + def start_bundle(self, context=None): + self.label_to_id_map = {} + + def process(self, element, all_labels): + all_labels = list(all_labels) + # DataFlow cannot garuantee the order of the labels when materializing it. + # The labels materialized and consumed by training may not be with the same order + # as the one used in preprocessing. So we need to sort it in both preprocessing + # and training so the order matches. + all_labels.sort() + if not self.label_to_id_map: + for i, label in enumerate(all_labels): + label = label.strip() + if label: + self.label_to_id_map[label] = i + + # Row format is: + # image_uri,label_id + if not element: + skipped_empty_line.inc() + return + + rows_count.inc() + uri = element['image_url'] + if not uri or not uri.startswith('gs://'): + invalid_uri.inc() + return + + try: + label_id = self.label_to_id_map[element['label'].strip()] + except KeyError: + unlabeled_image.inc() + yield uri, label_id + + +class ReadImageAndConvertToJpegDoFn(beam.DoFn): + """Read files from GCS and convert images to JPEG format. + + We do this even for JPEG images to remove variations such as different number + of channels. + """ + + def process(self, element): + uri, label_id = element + + try: + with _util.open_local_or_gcs(uri, mode='r') as f: + img = Image.open(f).convert('RGB') + # A variety of different calling libraries throw different exceptions here. + # They all correspond to an unreadable file so we treat them equivalently. + # pylint: disable broad-except + except Exception as e: + logging.exception('Error processing image %s: %s', uri, str(e)) + error_count.inc() + return + + # Convert to desired format and output. + output = cStringIO.StringIO() + img.save(output, 'jpeg') + image_bytes = output.getvalue() + yield uri, label_id, image_bytes + + +class EmbeddingsGraph(object): + """Builds a graph and uses it to extract embeddings from images. + """ + + # These constants are set by Inception v3's expectations. + WIDTH = 299 + HEIGHT = 299 + CHANNELS = 3 + + def __init__(self, tf_session, checkpoint_path): + self.tf_session = tf_session + # input_jpeg is the tensor that contains raw image bytes. + # It is used to feed image bytes and obtain embeddings. + self.input_jpeg, self.embedding = self.build_graph() + self.tf_session.run(tf.global_variables_initializer()) + self.restore_from_checkpoint(checkpoint_path) + + def build_graph(self): + """Forms the core by building a wrapper around the inception graph. 
+ + Here we add the necessary input & output tensors, to decode jpegs, + serialize embeddings, restore from checkpoint etc. + + To use other Inception models modify this file. Note that to use other + models beside Inception, you should make sure input_shape matches + their input. Resizing or other modifications may be necessary as well. + See tensorflow/contrib/slim/python/slim/nets/inception_v3.py for + details about InceptionV3. + + Returns: + input_jpeg: A tensor containing raw image bytes as the input layer. + embedding: The embeddings tensor, that will be materialized later. + """ + + input_jpeg = tf.placeholder(tf.string, shape=None) + image = tf.image.decode_jpeg(input_jpeg, channels=self.CHANNELS) + + # Note resize expects a batch_size, but we are feeding a single image. + # So we have to expand then squeeze. Resize returns float32 in the + # range [0, uint8_max] + image = tf.expand_dims(image, 0) + + # convert_image_dtype also scales [0, uint8_max] -> [0 ,1). + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + image = tf.image.resize_bilinear( + image, [self.HEIGHT, self.WIDTH], align_corners=False) + + # Then rescale range to [-1, 1) for Inception. + image = tf.subtract(image, 0.5) + inception_input = tf.multiply(image, 2.0) + + # Build Inception layers, which expect a tensor of type float from [-1, 1) + # and shape [batch_size, height, width, channels]. + with slim.arg_scope(_inceptionlib.inception_v3_arg_scope()): + _, end_points = _inceptionlib.inception_v3(inception_input, is_training=False) + + embedding = end_points['PreLogits'] + return input_jpeg, embedding + + def restore_from_checkpoint(self, checkpoint_path): + """To restore inception model variables from the checkpoint file. + + Some variables might be missing in the checkpoint file, so it only + loads the ones that are avialable, assuming the rest would be + initialized later. + Args: + checkpoint_path: Path to the checkpoint file for the Inception graph. + """ + # Get all variables to restore. Exclude Logits and AuxLogits because they + # depend on the input data and we do not need to intialize them from + # checkpoint. + all_vars = tf.contrib.slim.get_variables_to_restore( + exclude=['InceptionV3/AuxLogits', 'InceptionV3/Logits', 'global_step']) + + saver = tf.train.Saver(all_vars) + saver.restore(self.tf_session, checkpoint_path) + + def calculate_embedding(self, batch_image_bytes): + """Get the embeddings for a given JPEG image. + + Args: + batch_image_bytes: As if returned from [ff.read() for ff in file_list]. + + Returns: + The Inception embeddings (bottleneck layer output) + """ + return self.tf_session.run( + self.embedding, feed_dict={self.input_jpeg: batch_image_bytes}) + + +class TFExampleFromImageDoFn(beam.DoFn): + """Embeds image bytes and labels, stores them in tensorflow.Example. + + (uri, label_ids, image_bytes) -> (tensorflow.Example). + + Output proto contains 'label', 'image_uri' and 'embedding'. + The 'embedding' is calculated by feeding image into input layer of image + neural network and reading output of the bottleneck layer of the network. + + Attributes: + image_graph_uri: an uri to gcs bucket where serialized image graph is + stored. + """ + + def __init__(self, checkpoint_path): + self.tf_session = None + self.graph = None + self.preprocess_graph = None + self._checkpoint_path = checkpoint_path + + def start_bundle(self, context=None): + # There is one tensorflow session per instance of TFExampleFromImageDoFn. + # The same instance of session is re-used between bundles. 
+ # Session is closed by the destructor of Session object, which is called + # when instance of TFExampleFromImageDoFn() is destructed. + if not self.graph: + self.graph = tf.Graph() + self.tf_session = tf.InteractiveSession(graph=self.graph) + with self.graph.as_default(): + self.preprocess_graph = EmbeddingsGraph(self.tf_session, self._checkpoint_path) + + def finish_bundle(self, context=None): + if self.tf_session is not None: + self.tf_session.close() + + def process(self, element): + + def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + + def _float_feature(value): + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + uri, label_id, image_bytes = element + + try: + embedding = self.preprocess_graph.calculate_embedding(image_bytes) + except tf.errors.InvalidArgumentError as e: + incompatible_image.inc() + logging.warning('Could not encode an image from %s: %s', uri, str(e)) + return + + if embedding.any(): + embedding_good.inc() + else: + embedding_bad.inc() + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image_uri': _bytes_feature([str(uri)]), + 'embedding': _float_feature(embedding.ravel().tolist()), + })) + + example.features.feature['label'].int64_list.value.append(label_id) + + yield example + + +class TrainEvalSplitPartitionFn(beam.PartitionFn): + """Split train and eval data.""" + def partition_for(self, element, num_partitions): + import random + return 1 if random.random() > 0.7 else 0 + + +class ExampleProtoCoder(beam.coders.Coder): + """A coder to encode and decode TensorFlow Example objects.""" + + def __init__(self): + import tensorflow as tf # pylint: disable=g-import-not-at-top + self._tf_train = tf.train + + def encode(self, example_proto): + return example_proto.SerializeToString() + + def decode(self, serialized_str): + example = self._tf_train.Example() + example.ParseFromString(serialized_str) + return example + + +class SaveFeatures(beam.PTransform): + """Save Features in a TFRecordIO format. 
+ """ + + def __init__(self, file_path_prefix): + super(SaveFeatures, self).__init__('SaveFeatures') + self._file_path_prefix = file_path_prefix + + def expand(self, features): + return (features + | 'Write to %s' % self._file_path_prefix.replace('/', '_') + >> tfrecordio.WriteToTFRecord( + file_path_prefix=self._file_path_prefix, + file_name_suffix='.tfrecord.gz', + shard_name_template=fileio.DEFAULT_SHARD_NAME_TEMPLATE, + coder=ExampleProtoCoder(), + compression_type=fileio.CompressionTypes.AUTO)) + + +def _labels_pipeline(sources): + labels = (sources + | 'Flatten Sources for labels' >> beam.Flatten() + | 'Parse input for labels' >> beam.Map(lambda x: str(x['label'])) + | 'Combine labels' >> beam.transforms.combiners.Count.PerElement() + | 'Get labels' >> beam.Map(lambda label_count: label_count[0])) + return labels + + +def _transformation_pipeline(source, checkpoint, labels, mode): + transformed = (source + | 'Extract label ids(%s)' % mode >> beam.ParDo(ExtractLabelIdsDoFn(), + beam.pvalue.AsIter(labels)) + | 'Read and convert to JPEG(%s)' % mode >> beam.ParDo(ReadImageAndConvertToJpegDoFn()) + | 'Embed and make TFExample(%s)' % mode >> + beam.ParDo(TFExampleFromImageDoFn(checkpoint))) + return transformed + + +def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id): + source_train = _util.get_sources_from_dataset(p, dataset_train, 'train') + labels_source = [source_train] + if dataset_eval is not None: + source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval') + labels_source.append(source_eval) + + labels = _labels_pipeline(labels_source) + train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train') + if dataset_eval is not None: + # explicit eval data. + eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval') + else: + # Split train/eval. + train_preprocessed, eval_preprocessed = (train_preprocessed | + 'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2)) + + output_train_path = os.path.join(output_dir, job_id, 'train') + output_eval_path = os.path.join(output_dir, job_id, 'eval') + labels_file = os.path.join(output_dir, job_id, 'labels') + labels_save = (labels + | 'Write labels' >> beam.io.textio.WriteToText(labels_file, shard_name_template='')) + train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path) + eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path) + # Make sure we write "latest" file after train and eval data are successfully written. + output_latest_file = os.path.join(output_dir, 'latest') + ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() | + beam.transforms.combiners.Sample.FixedSizeGlobally('Fixed One', 1) | + beam.Map(lambda path: job_id) | + 'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template='')) + diff --git a/solutionbox/inception/datalab_solutions/inception/_trainer.py b/solutionbox/inception/datalab_solutions/inception/_trainer.py new file mode 100644 index 000000000..8003ee0ba --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_trainer.py @@ -0,0 +1,274 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Training implementation for inception model. +""" + +import logging +import os +import tensorflow as tf +import time + +from . import _util + + +def start_server(cluster, task): + if not task.type: + raise ValueError('--task_type must be specified.') + if task.index is None: + raise ValueError('--task_index must be specified.') + + # Create and start a server. + return tf.train.Server( + tf.train.ClusterSpec(cluster), + protocol='grpc', + job_name=task.type, + task_index=task.index) + +class Evaluator(object): + """Loads variables from latest checkpoint and performs model evaluation.""" + + def __init__(self, model, data_paths, batch_size, output_path, dataset='eval'): + data_size = self._data_size(data_paths) + if data_size <= batch_size: + raise Exception('Data size is smaller than batch size.') + self.num_eval_batches = data_size // batch_size + self.batch_of_examples = [] + self.checkpoint_path = os.path.join(output_path, 'train') + self.output_path = os.path.join(output_path, dataset) + self.eval_data_paths = data_paths + self.batch_size = batch_size + self.model = model + + + def _data_size(self, data_paths): + n = 0 + options = tf.python_io.TFRecordOptions( + compression_type=tf.python_io.TFRecordCompressionType.GZIP) + for file in data_paths: + for line in tf.python_io.tf_record_iterator(file, options=options): + n += 1 + return n + + def evaluate(self, num_eval_batches=None): + """Run one round of evaluation, return loss and accuracy.""" + + num_eval_batches = num_eval_batches or self.num_eval_batches + with tf.Graph().as_default() as graph: + self.tensors = self.model.build_eval_graph(self.eval_data_paths, + self.batch_size) + self.summary = tf.summary.merge_all() + self.saver = tf.train.Saver() + + self.summary_writer = tf.summary.FileWriter(self.output_path) + self.sv = tf.train.Supervisor( + graph=graph, + logdir=self.output_path, + summary_op=None, + global_step=None, + saver=self.saver) + + last_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path) + with self.sv.managed_session( + master='', start_standard_services=False) as session: + self.sv.saver.restore(session, last_checkpoint) + + if not self.batch_of_examples: + self.sv.start_queue_runners(session) + for i in range(num_eval_batches): + self.batch_of_examples.append(session.run(self.tensors.examples)) + + for i in range(num_eval_batches): + session.run(self.tensors.metric_updates, + {self.tensors.examples: self.batch_of_examples[i]}) + + metric_values = session.run(self.tensors.metric_values) + global_step = tf.train.global_step(session, self.tensors.global_step) + summary = session.run(self.summary) + self.summary_writer.add_summary(summary, global_step) + self.summary_writer.flush() + return metric_values + + + +class Trainer(object): + """Performs model training and optionally evaluation.""" + + def __init__(self, input_dir, batch_size, max_steps, output_path, model, cluster, task): + train_files, eval_files = _util.get_train_eval_files(input_dir) + self.train_data_paths = train_files + self.output_path = output_path + self.batch_size = batch_size + self.model = model + self.max_steps = 
max_steps + self.cluster = cluster + self.task = task + self.evaluator = Evaluator(self.model, eval_files, batch_size, output_path, 'eval_set') + self.train_evaluator = Evaluator(self.model, train_files, batch_size, output_path, 'train_set') + self.min_train_eval_rate = 8 + + def run_training(self): + """Runs a Master.""" + self.train_path = os.path.join(self.output_path, 'train') + self.model_path = os.path.join(self.output_path, 'model') + self.is_master = self.task.type != 'worker' + log_interval = 15 + self.eval_interval = 30 + if self.is_master and self.task.index > 0: + raise StandardError('Only one replica of master expected') + + if self.cluster: + logging.info('Starting %s/%d', self.task.type, self.task.index) + server = start_server(self.cluster, self.task) + target = server.target + device_fn = tf.train.replica_device_setter( + ps_device='/job:ps', + worker_device='/job:%s/task:%d' % (self.task.type, self.task.index), + cluster=self.cluster) + # We use a device_filter to limit the communication between this job + # and the parameter servers, i.e., there is no need to directly + # communicate with the other workers; attempting to do so can result + # in reliability problems. + device_filters = [ + '/job:ps', '/job:%s/task:%d' % (self.task.type, self.task.index) + ] + config = tf.ConfigProto(device_filters=device_filters) + else: + target = '' + device_fn = '' + config = None + + with tf.Graph().as_default() as graph: + with tf.device(device_fn): + # Build the training graph. + self.tensors = self.model.build_train_graph(self.train_data_paths, + self.batch_size) + + # Add the variable initializer Op. + init_op = tf.global_variables_initializer() + + # Create a saver for writing training checkpoints. + self.saver = tf.train.Saver() + + # Build the summary operation based on the TF collection of Summaries. + self.summary_op = tf.summary.merge_all() + + # Create a "supervisor", which oversees the training process. + self.sv = tf.train.Supervisor( + graph, + is_chief=self.is_master, + logdir=self.train_path, + init_op=init_op, + saver=self.saver, + # Write summary_ops by hand. + summary_op=None, + global_step=self.tensors.global_step, + # No saving; we do it manually in order to easily evaluate immediately + # afterwards. + save_model_secs=0) + + should_retry = True + to_run = [self.tensors.global_step, self.tensors.train] + + while should_retry: + try: + should_retry = False + with self.sv.managed_session(target, config=config) as session: + self.start_time = start_time = time.time() + self.last_save = self.last_log = 0 + self.global_step = self.last_global_step = 0 + self.local_step = self.last_local_step = 0 + self.last_global_time = self.last_local_time = start_time + + # Loop until the supervisor shuts down or max_steps have + # completed. + max_steps = self.max_steps + while not self.sv.should_stop() and self.global_step < max_steps: + try: + # Run one step of the model. + self.global_step = session.run(to_run)[0] + self.local_step += 1 + + self.now = time.time() + is_time_to_eval = (self.now - self.last_save) > self.eval_interval + is_time_to_log = (self.now - self.last_log) > log_interval + should_eval = self.is_master and is_time_to_eval + should_log = is_time_to_log or should_eval + + if should_log: + self.log(session) + + if should_eval: + self.eval(session) + except tf.errors.AbortedError: + should_retry = True + + if self.is_master: + # Take the final checkpoint and compute the final accuracy. 
+ # self.saver.save(session, self.sv.save_path, self.tensors.global_step) + self.eval(session) + + except tf.errors.AbortedError: + print('Hitting an AbortedError. Trying it again.') + should_retry = True + + # Export the model for inference. + if self.is_master: + self.model.export(tf.train.latest_checkpoint(self.train_path), self.model_path) + + # Ask for all the services to stop. + self.sv.stop() + + def log(self, session): + """Logs training progress.""" + logging.info('Train [%s/%d], step %d (%.3f sec) %.1f ' + 'global steps/s, %.1f local steps/s', self.task.type, + self.task.index, self.global_step, + (self.now - self.start_time), + (self.global_step - self.last_global_step) / + (self.now - self.last_global_time), + (self.local_step - self.last_local_step) / + (self.now - self.last_local_time)) + self.last_log = self.now + self.last_global_step, self.last_global_time = self.global_step, self.now + self.last_local_step, self.last_local_time = self.local_step, self.now + + def eval(self, session): + """Runs evaluation loop.""" + eval_start = time.time() + self.saver.save(session, self.sv.save_path, self.tensors.global_step) + logging.info( + 'Eval, step %d:\n- on train set %s\n-- on eval set %s', + self.global_step, + self.model.format_metric_values(self.train_evaluator.evaluate()), + self.model.format_metric_values(self.evaluator.evaluate())) + now = time.time() + + # Make sure eval doesn't consume too much of total time. + eval_time = now - eval_start + train_eval_rate = self.eval_interval / eval_time + if train_eval_rate < self.min_train_eval_rate and self.last_save > 0: + logging.info('Adjusting eval interval from %.2fs to %.2fs', + self.eval_interval, self.min_train_eval_rate * eval_time) + self.eval_interval = self.min_train_eval_rate * eval_time + + self.last_save = now + self.last_log = now + + def save_summaries(self, session): + self.sv.summary_computed(session, + session.run(self.summary_op), self.global_step) + self.sv.summary_writer.flush() + diff --git a/solutionbox/inception/datalab_solutions/inception/_util.py b/solutionbox/inception/datalab_solutions/inception/_util.py new file mode 100644 index 000000000..8e2ad9fa9 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/_util.py @@ -0,0 +1,268 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Reusable utility functions. 
+""" + +from apache_beam.io import gcsio +import collections +import glob +import multiprocessing +import os + +import tensorflow as tf +from tensorflow.python.lib.io import file_io + + +_DEFAULT_CHECKPOINT_GSURL = 'gs://cloud-ml-data/img/flower_photos/inception_v3_2016_08_28.ckpt' + + +def is_in_IPython(): + try: + import IPython + return True + except ImportError: + return False + + +def default_project(): + import datalab.context + context = datalab.context.Context.default() + return context.project_id + + +def open_local_or_gcs(path, mode): + """Opens the given path.""" + if path.startswith('gs://'): + try: + return gcsio.GcsIO().open(path, mode) + except Exception as e: # pylint: disable=broad-except + # Currently we retry exactly once, to work around flaky gcs calls. + logging.error('Retrying after exception reading gcs file: %s', e) + time.sleep(10) + return gcsio.GcsIO().open(path, mode) + else: + return open(path, mode) + + +def file_exists(path): + """Returns whether the file exists.""" + if path.startswith('gs://'): + return gcsio.GcsIO().exists(path) + else: + return os.path.exists(path) + + +def glob_files(path): + if path.startswith('gs://'): + return gcsio.GcsIO().glob(path) + else: + return glob.glob(path) + + +def _get_latest_data_dir(input_dir): + latest_file = os.path.join(input_dir, 'latest') + if not file_exists(latest_file): + raise Exception(('Cannot find "latest" file in "%s". ' + + 'Please use a preprocessing output dir.') % input_dir) + with open_local_or_gcs(latest_file, 'r') as f: + dir_name = f.read().rstrip() + return os.path.join(input_dir, dir_name) + + +def get_train_eval_files(input_dir): + """Get preprocessed training and eval files.""" + data_dir = _get_latest_data_dir(input_dir) + train_pattern = os.path.join(data_dir, 'train*.tfrecord.gz') + eval_pattern = os.path.join(data_dir, 'eval*.tfrecord.gz') + train_files = glob_files(train_pattern) + eval_files = glob_files(eval_pattern) + return train_files, eval_files + + +def get_labels(input_dir): + """Get a list of labels from preprocessed output dir.""" + data_dir = _get_latest_data_dir(input_dir) + labels_file = os.path.join(data_dir, 'labels') + with open_local_or_gcs(labels_file, mode='r') as f: + labels = f.read().rstrip().split('\n') + return labels + + +def read_examples(input_files, batch_size, shuffle, num_epochs=None): + """Creates readers and queues for reading example protos.""" + files = [] + for e in input_files: + for path in e.split(','): + files.extend(file_io.get_matching_files(path)) + thread_count = multiprocessing.cpu_count() + + # The minimum number of instances in a queue from which examples are drawn + # randomly. The larger this number, the more randomness at the expense of + # higher memory requirements. + min_after_dequeue = 1000 + + # When batching data, the queue's capacity will be larger than the batch_size + # by some factor. The recommended formula is (num_threads + a small safety + # margin). For now, we use a single thread for reading, so this can be small. + queue_size_multiplier = thread_count + 3 + + # Convert num_epochs == 0 -> num_epochs is None, if necessary + num_epochs = num_epochs or None + + # Build a queue of the filenames to be read. 
+ filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle) + + options = tf.python_io.TFRecordOptions( + compression_type=tf.python_io.TFRecordCompressionType.GZIP) + example_id, encoded_example = tf.TFRecordReader(options=options).read_up_to( + filename_queue, batch_size) + + if shuffle: + capacity = min_after_dequeue + queue_size_multiplier * batch_size + return tf.train.shuffle_batch( + [example_id, encoded_example], + batch_size, + capacity, + min_after_dequeue, + enqueue_many=True, + num_threads=thread_count) + else: + capacity = queue_size_multiplier * batch_size + return tf.train.batch( + [example_id, encoded_example], + batch_size, + capacity=capacity, + enqueue_many=True, + num_threads=thread_count) + + +def override_if_not_in_args(flag, argument, args): + """Checks if flags is in args, and if not it adds the flag to args.""" + if flag not in args: + args.extend([flag, argument]) + + +def loss(loss_value): + """Calculates aggregated mean loss.""" + total_loss = tf.Variable(0.0, False) + loss_count = tf.Variable(0, False) + total_loss_update = tf.assign_add(total_loss, loss_value) + loss_count_update = tf.assign_add(loss_count, 1) + loss_op = total_loss / tf.cast(loss_count, tf.float32) + return [total_loss_update, loss_count_update], loss_op + + +def accuracy(logits, labels): + """Calculates aggregated accuracy.""" + is_correct = tf.nn.in_top_k(logits, labels, 1) + correct = tf.reduce_sum(tf.cast(is_correct, tf.int32)) + incorrect = tf.reduce_sum(tf.cast(tf.logical_not(is_correct), tf.int32)) + correct_count = tf.Variable(0, False) + incorrect_count = tf.Variable(0, False) + correct_count_update = tf.assign_add(correct_count, correct) + incorrect_count_update = tf.assign_add(incorrect_count, incorrect) + accuracy_op = tf.cast(correct_count, tf.float32) / tf.cast( + correct_count + incorrect_count, tf.float32) + return [correct_count_update, incorrect_count_update], accuracy_op + + +def check_dataset(dataset, mode): + """Validate we have a good dataset.""" + + names = [x['name'] for x in dataset.schema] + types = [x['type'] for x in dataset.schema] + if mode == 'train': + if (set(['image_url', 'label']) != set(names) or any (t != 'STRING' for t in types)): + raise ValueError('Invalid dataset. Expect only "image_url,label" STRING columns.') + else: + if ((set(['image_url']) != set(names) and set(['image_url', 'label']) != set(names)) or + any (t != 'STRING' for t in types)): + raise ValueError('Invalid dataset. Expect only "image_url" or "image_url,label" ' + + 'STRING columns.') + + +def get_sources_from_dataset(p, dataset, mode): + """get pcollection from dataset.""" + + import apache_beam as beam + import csv + from datalab.ml import CsvDataSet, BigQueryDataSet + + check_dataset(dataset, mode) + if type(dataset) is CsvDataSet: + source_list = [] + for ii, input_path in enumerate(dataset.files): + source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >> + beam.io.ReadFromText(input_path, strip_trailing_newlines=True)) + return (source_list | 'Flatten Sources (%s)' % mode >> beam.Flatten() + | 'Create Dict from Csv (%s)' % mode >> + beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url', 'label']).next())) + elif type(dataset) is BigQueryDataSet: + bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else + beam.io.BigQuerySource(query=dataset.query)) + return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source) + else: + raise ValueError('Invalid DataSet. 
Expect CsvDataSet or BigQueryDataSet') + + +def decode_and_resize(image_str_tensor): + """Decodes jpeg string, resizes it and returns a uint8 tensor.""" + + # These constants are set by Inception v3's expectations. + height = 299 + width = 299 + channels = 3 + + image = tf.image.decode_jpeg(image_str_tensor, channels=channels) + # Note resize expects a batch_size, but tf_map supresses that index, + # thus we have to expand then squeeze. Resize returns float32 in the + # range [0, uint8_max] + image = tf.expand_dims(image, 0) + image = tf.image.resize_bilinear(image, [height, width], align_corners=False) + image = tf.squeeze(image, squeeze_dims=[0]) + image = tf.cast(image, dtype=tf.uint8) + return image + + +def resize_image(image_str_tensor): + """Decodes jpeg string, resizes it and re-encode it to jpeg.""" + + image = decode_and_resize(image_str_tensor) + image = tf.image.encode_jpeg(image, quality=100) + return image + + +def load_images(image_files, resize=True): + """Load images from files and optionally resize it.""" + + images = [] + for image_file in image_files: + with open_local_or_gcs(image_file, 'r') as ff: + images.append(ff.read()) + if resize is False: + return images + + # To resize, run a tf session so we can reuse 'decode_and_resize()' + # which is used in prediction graph. This makes sure we don't lose + # any quality in prediction, while decreasing the size of the images + # submitted to the model over network. + image_str_tensor = tf.placeholder(tf.string, shape=[None]) + image = tf.map_fn(resize_image, image_str_tensor, back_prop=False) + feed_dict = collections.defaultdict(list) + feed_dict[image_str_tensor.name] = images + with tf.Session() as sess: + images_resized = sess.run(image, feed_dict=feed_dict) + return images_resized diff --git a/solutionbox/inception/datalab_solutions/inception/setup.py b/solutionbox/inception/datalab_solutions/inception/setup.py new file mode 100644 index 000000000..93d72fcc9 --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/setup.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
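Returning to the _util helpers above, a small illustrative use of load_images (the file paths are hypothetical):

raw_jpegs = load_images(['gs://my-bucket/img/rose1.jpg', '/tmp/daisy.jpg'],
                        resize=False)
resized = load_images(['gs://my-bucket/img/rose1.jpg', '/tmp/daisy.jpg'])
# With resize=False the raw file contents come back unchanged; with the
# default resize=True each image is decoded, resized to 299x299 and
# re-encoded as JPEG before being sent to the model.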
+ +# To publish to PyPi use: python setup.py bdist_wheel upload -r pypi + +import datetime +from setuptools import setup + +minor = datetime.datetime.now().strftime("%y%m%d%H%M") +version = '0.1' + +setup( + name='inception', + version=version, + packages=[ + 'datalab_solutions', + 'datalab_solutions.inception', + ], + + description='Google Cloud Datalab Inception Package', + author='Google', + author_email='google-cloud-datalab-feedback@googlegroups.com', + keywords=[ + ], + license="Apache Software License", + classifiers=[ + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Development Status :: 4 - Beta", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules" + ], + long_description=""" + """, + install_requires=[ + 'tensorflow==1.0', + 'protobuf==3.1.0', + 'google-cloud-dataflow==0.5.5', + ], + package_data={ + } +) diff --git a/solutionbox/inception/datalab_solutions/inception/task.py b/solutionbox/inception/datalab_solutions/inception/task.py new file mode 100644 index 000000000..73434118b --- /dev/null +++ b/solutionbox/inception/datalab_solutions/inception/task.py @@ -0,0 +1,84 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Entry point for CloudML training. + + CloudML training requires a tarball package and a python module to run. This file + provides such a "main" method and a list of args passed with the program. +""" + +import argparse +import json +import logging +import os +import tensorflow as tf + +from . import _model +from . import _trainer +from . import _util + + +def main(_): + parser = argparse.ArgumentParser() + parser.add_argument( + '--input_dir', + type=str, + help='The input dir path for training and evaluation data.') + parser.add_argument( + '--output_path', + type=str, + help='The path to which checkpoints and other outputs ' + 'should be saved. This can be either a local or GCS ' + 'path.') + parser.add_argument( + '--max_steps', + type=int,) + parser.add_argument( + '--batch_size', + type=int, + help='Number of examples to be processed per mini-batch.') + parser.add_argument( + '--checkpoint', + type=str, + default=_util._DEFAULT_CHECKPOINT_GSURL, + help='Pretrained inception checkpoint path.') + + args, _ = parser.parse_known_args() + labels = _util.get_labels(args.input_dir) + model = _model.Model(labels, 0.5, args.checkpoint) + + env = json.loads(os.environ.get('TF_CONFIG', '{}')) + # Print the job data as provided by the service. 
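For reference, TF_CONFIG as set by the training service is a JSON string of roughly this shape (the concrete values below are invented for illustration):

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'ps': ['10.0.0.1:2222'],
                'worker': ['10.0.0.2:2222'],
                'master': ['10.0.0.3:2222']},
    'task': {'type': 'worker', 'index': 0},
    'job': {'job_name': 'my_training_job'},
})
# The code below pulls 'task' and 'cluster' out of this dict and falls back
# to a single local master when TF_CONFIG is not set.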
+ logging.info('Original job data: %s', env.get('job', {})) + task_data = env.get('task', None) or {'type': 'master', 'index': 0} + task = type('TaskSpec', (object,), task_data) + trial = task_data.get('trial') + if trial is not None: + args.output_path = os.path.join(args.output_path, trial) + + cluster_data = env.get('cluster', None) + cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None + if not cluster or not task or task.type == 'master' or task.type == 'worker': + _trainer.Trainer(args.input_dir, args.batch_size, args.max_steps, + args.output_path, model, cluster, task).run_training() + elif task.type == 'ps': + server = _trainer.start_server(cluster, task) + server.join() + else: + raise ValueError('invalid task_type %s' % (task.type,)) + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + tf.app.run() diff --git a/solutionbox/inception/setup.py b/solutionbox/inception/setup.py new file mode 100644 index 000000000..93d72fcc9 --- /dev/null +++ b/solutionbox/inception/setup.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# To publish to PyPi use: python setup.py bdist_wheel upload -r pypi + +import datetime +from setuptools import setup + +minor = datetime.datetime.now().strftime("%y%m%d%H%M") +version = '0.1' + +setup( + name='inception', + version=version, + packages=[ + 'datalab_solutions', + 'datalab_solutions.inception', + ], + + description='Google Cloud Datalab Inception Package', + author='Google', + author_email='google-cloud-datalab-feedback@googlegroups.com', + keywords=[ + ], + license="Apache Software License", + classifiers=[ + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Development Status :: 4 - Beta", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules" + ], + long_description=""" + """, + install_requires=[ + 'tensorflow==1.0', + 'protobuf==3.1.0', + 'google-cloud-dataflow==0.5.5', + ], + package_data={ + } +) diff --git a/solutionbox/structured_data/build.sh b/solutionbox/structured_data/build.sh new file mode 100755 index 000000000..7946fbf79 --- /dev/null +++ b/solutionbox/structured_data/build.sh @@ -0,0 +1,8 @@ +#! /bin/bash + + +rm -fr dist +cp setup.py datalab_solutions/structured_data/master_setup.py +python setup.py sdist + + diff --git a/solutionbox/structured_data/datalab_solutions/__init__.py b/solutionbox/structured_data/datalab_solutions/__init__.py new file mode 100644 index 000000000..3d74130ef --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py new file mode 100644 index 000000000..76a12ce46 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + + +from ._package import local_preprocess, cloud_preprocess, local_train, cloud_train, local_predict, \ + cloud_predict, local_batch_predict, cloud_batch_predict + +# Source of truth for the version of this package. +__version__ = '0.0.1' \ No newline at end of file diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/_package.py b/solutionbox/structured_data/datalab_solutions/structured_data/_package.py new file mode 100644 index 000000000..5b33eea8b --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/_package.py @@ -0,0 +1,586 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Provides interface for Datalab. + + Datalab will look for functions with the below names: + local_preprocess + local_train + local_predict + cloud_preprocess + cloud_train + cloud_predict +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import datetime +import logging +import os +import shutil +import subprocess +import sys +import tempfile +import urllib +import json +import glob +import StringIO +import subprocess + +import pandas as pd +import tensorflow as tf + +from tensorflow.python.lib.io import file_io + +from . import preprocess +from . import trainer +from . import predict + + +def _default_project(): + import datalab.context + context = datalab.context.Context.default() + return context.project_id + +def _is_in_IPython(): + try: + import IPython + return True + except ImportError: + return False + +def _assert_gcs_files(files): + """Check files starts wtih gs://. 
+ + Args: + files: string to file path, or list of file paths. + """ + if isinstance(files, basestring): + files = [files] + + for f in files: + if f is not None and not f.startswith('gs://'): + raise ValueError('File %s is not a gcs path' % f) + + +def _package_to_staging(staging_package_url): + """Repackage this package from local installed location and copy it to GCS. + + Args: + staging_package_url: GCS path. + """ + import datalab.ml as ml + + # Find the package root. __file__ is under [package_root]/datalab_solutions/inception. + package_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), '../../')) + setup_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'master_setup.py')) + tar_gz_path = os.path.join(staging_package_url, 'staging', 'sd.tar.gz') + + print('Building package and uploading to %s' % tar_gz_path) + ml.package_and_copy(package_root, setup_path, tar_gz_path) + + return tar_gz_path + + +def local_preprocess(output_dir, dataset): + """Preprocess data locally with Pandas + + Produce analysis used by training. + + Args: + output_dir: The output directory to use. + dataset: only CsvDataSet is supported currently. + """ + import datalab.ml as ml + if not isinstance(dataset, ml.CsvDataSet): + raise ValueError('Only CsvDataSet is supported') + + if len(dataset.input_files) != 1: + raise ValueError('CsvDataSet should be built with a file pattern, not a ' + 'list of files.') + + # Write schema to a file. + tmp_dir = tempfile.mkdtemp() + _, schema_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.json', + prefix='schema') + try: + file_io.write_string_to_file(schema_file_path, json.dumps(dataset.schema)) + + args = ['local_preprocess', + '--input_file_pattern=%s' % dataset.input_files[0], + '--output_dir=%s' % output_dir, + '--schema_file=%s' % schema_file_path] + + print('Starting local preprocessing.') + preprocess.local_preprocess.main(args) + print('Local preprocessing done.') + finally: + shutil.rmtree(tmp_dir) + +def cloud_preprocess(output_dir, dataset, project_id=None): + """Preprocess data in the cloud with BigQuery. + + Produce analysis used by training. This can take a while, even for small + datasets. For small datasets, it may be faster to use local_preprocess. + + Args: + output_dir: The output directory to use. + dataset: only CsvDataSet is supported currently. + project_id: project id the table is in. If none, uses the default project. + """ + import datalab.ml as ml + if not isinstance(dataset, ml.CsvDataSet): + raise ValueError('Only CsvDataSet is supported') + + if len(dataset.input_files) != 1: + raise ValueError('CsvDataSet should be built with a file pattern, not a ' + 'list of files.') + + _assert_gcs_files([output_dir, dataset.input_files[0]]) + + # Write schema to a file. 
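A hedged usage sketch of the two preprocessing entry points; the bucket and project names are illustrative, and csv_data is assumed to be a datalab.ml.CsvDataSet built from a single file pattern plus a schema:

local_preprocess('./sd_analysis', csv_data)
cloud_preprocess('gs://my-bucket/sd_analysis', csv_data, project_id='my-project')
# The cloud variant requires both the dataset's file pattern and the output
# directory to be GCS paths.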
+ tmp_dir = tempfile.mkdtemp() + _, schema_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.json', + prefix='schema') + try: + file_io.write_string_to_file(schema_file_path, json.dumps(dataset.schema)) + + args = ['cloud_preprocess', + '--input_file_pattern=%s' % dataset.input_files[0], + '--output_dir=%s' % output_dir, + '--schema_file=%s' % schema_file_path] + + + print('Starting cloud preprocessing.') + print('Track BigQuery status at') + print('https://bigquery.cloud.google.com/queries/%s' % _default_project()) + preprocess.cloud_preprocess.main(args) + print('Cloud preprocessing done.') + finally: + shutil.rmtree(tmp_dir) + + +def local_train(train_dataset, + eval_dataset, + preprocess_output_dir, + output_dir, + transforms, + model_type, + max_steps=5000, + num_epochs=None, + train_batch_size=100, + eval_batch_size=100, + min_eval_frequency=100, + top_n=None, + layer_sizes=None, + learning_rate=0.01, + epsilon=0.0005): + """Train model locally. + Args: + train_dataset: CsvDataSet + eval_dataset: CsvDataSet + preprocess_output_dir: The output directory from preprocessing + output_dir: Output directory of training. + transforms: file path or transform object. Example: + { + "col_A": {"transform": "scale", "default": 0.0}, + "col_B": {"transform": "scale","value": 4}, + # Note col_C is missing, so default transform used. + "col_D": {"transform": "hash_one_hot", "hash_bucket_size": 4}, + "col_target": {"transform": "target"}, + "col_key": {"transform": "key"} + } + The keys correspond to the columns in the input files as defined by the + schema file during preprocessing. Some notes + 1) The "key" and "target" transforms are required. + 2) Default values are optional. These are used if the input data has + missing values during training and prediction. If not supplied for a + column, the default value for a numerical column is that column's + mean vlaue, and for a categorical column the empty string is used. + 3) For numerical colums, the following transforms are supported: + i) {"transform": "identity"}: does nothing to the number. (default) + ii) {"transform": "scale"}: scales the colum values to -1, 1. + iii) {"transform": "scale", "value": a}: scales the colum values + to -a, a. + + For categorical colums, the transform supported depends on if the + model is a linear or DNN model because tf.layers is uesed. + For a linear model, the transforms supported are: + i) {"transform": "sparse"}: Makes a sparse vector using the full + vocabulary associated with the column (default). + ii) {"transform": "hash_sparse", "hash_bucket_size": n}: First each + string is hashed to an integer in the range [0, n), and then a + sparse vector is used. + + For a DNN model, the categorical transforms that are supported are: + i) {"transform": "one_hot"}: A one-hot vector using the full + vocabulary is used. (default) + ii) {"transform": "embedding", "embedding_dim": d}: Each label is + embedded into an d-dimensional space. + iii) {"transform": "hash_one_hot", "hash_bucket_size": n}: The label + is first hashed into the range [0, n) and then a one-hot encoding + is made. + iv) {"transform": "hash_embedding", "hash_bucket_size": n, + "embedding_dim": d}: First each label is hashed to [0, n), and + then each integer is embedded into a d-dimensional space. + model_type: One of linear_classification, linear_regression, + dnn_classification, dnn_regression. + max_steps: Int. Number of training steps to perform. + num_epochs: Maximum number of training data epochs on which to train. 
+ The training job will run for max_steps or num_epochs, whichever occurs + first. + train_batch_size: number of rows to train on in one step. + eval_batch_size: number of rows to eval in one step. + min_eval_frequency: Minimum number of training steps between evaluations. + top_n: Int. For classification problems, the output graph will contain the + labels and scores for the top n classes with a default of n=1. Use + None for regression problems. + layer_sizes: List. Represents the layers in the connected DNN. + If the model type is DNN, this must be set. Example [10, 3, 2], this + will create three DNN layers where the first layer will have 10 nodes, + the middle layer will have 3 nodes, and the laster layer will have 2 + nodes. + learning_rate: tf.train.AdamOptimizer's learning rate, + epsilon: tf.train.AdamOptimizer's epsilon value. + """ + if len(train_dataset.input_files) != 1 or len(eval_dataset.input_files) != 1: + raise ValueError('CsvDataSets must be built with a file pattern, not list ' + 'of files.') + + if file_io.file_exists(output_dir): + raise ValueError('output_dir already exist. Use a new output path.') + + if isinstance(transforms, dict): + # Make a transforms file. + if not file_io.file_exists(output_dir): + file_io.recursive_create_dir(output_dir) + transforms_file = os.path.join(output_dir, 'transforms_file.json') + file_io.write_string_to_file( + transforms_file, + json.dumps(transforms)) + else: + transforms_file = transforms + + args = ['local_train', + '--train_data_paths=%s' % train_dataset.input_files[0], + '--eval_data_paths=%s' % eval_dataset.input_files[0], + '--output_path=%s' % output_dir, + '--preprocess_output_dir=%s' % preprocess_output_dir, + '--transforms_file=%s' % transforms_file, + '--model_type=%s' % model_type, + '--max_steps=%s' % str(max_steps), + '--train_batch_size=%s' % str(train_batch_size), + '--eval_batch_size=%s' % str(eval_batch_size), + '--min_eval_frequency=%s' % str(min_eval_frequency), + '--learning_rate=%s' % str(learning_rate), + '--epsilon=%s' % str(epsilon)] + if num_epochs: + args.append('--num_epochs=%s' % str(num_epochs)) + if top_n: + args.append('--top_n=%s' % str(top_n)) + if layer_sizes: + for i in range(len(layer_sizes)): + args.append('--layer_size%s=%s' % (i+1, str(layer_sizes[i]))) + + stderr = sys.stderr + sys.stderr = sys.stdout + print('Starting local training.') + trainer.task.main(args) + print('Local training done.') + sys.stderr = stderr + +def cloud_train(train_dataset, + eval_dataset, + preprocess_output_dir, + output_dir, + transforms, + model_type, + cloud_training_config, + max_steps=5000, + num_epochs=None, + train_batch_size=100, + eval_batch_size=100, + min_eval_frequency=100, + top_n=None, + layer_sizes=None, + learning_rate=0.01, + epsilon=0.0005, + job_name=None): + """Train model using CloudML. + + See local_train() for a description of the args. + Args: + cloud_training_config: A CloudTrainingConfig object. + job_name: Training job name. A default will be picked if None. + """ + import datalab + + if len(train_dataset.input_files) != 1 or len(eval_dataset.input_files) != 1: + raise ValueError('CsvDataSets must be built with a file pattern, not list ' + 'of files.') + + if file_io.file_exists(output_dir): + raise ValueError('output_dir already exist. Use a new output path.') + + if isinstance(transforms, dict): + # Make a transforms file. 
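As a usage sketch of local_train; the column names, paths and sizes below are illustrative, and train_csv and eval_csv are assumed to be CsvDataSet objects built from single file patterns:

transforms = {'fare': {'transform': 'scale'},
              'pickup_zone': {'transform': 'one_hot'},
              'fare_class': {'transform': 'target'},
              'row_id': {'transform': 'key'}}
local_train(train_csv, eval_csv,
            preprocess_output_dir='./sd_analysis',
            output_dir='./sd_train',
            transforms=transforms,
            model_type='dnn_classification',
            max_steps=2000,
            layer_sizes=[64, 16])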
+ if not file_io.file_exists(output_dir): + file_io.recursive_create_dir(output_dir) + transforms_file = os.path.join(output_dir, 'transforms_file.json') + file_io.write_string_to_file( + transforms_file, + json.dumps(transforms)) + else: + transforms_file = transforms + + _assert_gcs_files([output_dir, train_dataset.input_files[0], + eval_dataset.input_files[0], transforms_file, + preprocess_output_dir]) + + args = ['--train_data_paths=%s' % train_dataset.input_files[0], + '--eval_data_paths=%s' % eval_dataset.input_files[0], + '--output_path=%s' % output_dir, + '--preprocess_output_dir=%s' % preprocess_output_dir, + '--transforms_file=%s' % transforms_file, + '--model_type=%s' % model_type, + '--max_steps=%s' % str(max_steps), + '--train_batch_size=%s' % str(train_batch_size), + '--eval_batch_size=%s' % str(eval_batch_size), + '--min_eval_frequency=%s' % str(min_eval_frequency), + '--learning_rate=%s' % str(learning_rate), + '--epsilon=%s' % str(epsilon)] + if num_epochs: + args.append('--num_epochs=%s' % str(num_epochs)) + if top_n: + args.append('--top_n=%s' % str(top_n)) + if layer_sizes: + for i in range(len(layer_sizes)): + args.append('--layer_size%s=%s' % (i+1, str(layer_sizes[i]))) + + job_request = { + 'package_uris': [_package_to_staging(output_dir)], + 'python_module': 'datalab_solutions.structured_data.trainer.task', + 'args': args + } + job_request.update(dict(cloud_training_config._asdict())) + + if not job_name: + job_name = 'structured_data_train_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S') + job = datalab.ml.Job.submit_training(job_request, job_name) + print('Job request send. View status of job at') + print('https://console.developers.google.com/ml/jobs?project=%s' % + _default_project()) + + return job + + +def local_predict(training_ouput_dir, data): + """Runs local prediction on the prediction graph. + + Runs local prediction and returns the result in a Pandas DataFrame. For + running prediction on a large dataset or saving the results, run + local_batch_prediction or batch_prediction. Input data should fully match + the schema that was used at training, except the target column should not + exist. + + Args: + training_ouput_dir: local path to the trained output folder. + data: List of csv strings or a Pandas DataFrame that match the model schema. + + """ + # Save the instances to a file, call local batch prediction, and print it back + tmp_dir = tempfile.mkdtemp() + _, input_file_path = tempfile.mkstemp(dir=tmp_dir, suffix='.csv', + prefix='input') + + try: + if isinstance(data, pd.DataFrame): + data.to_csv(input_file_path, header=False, index=False) + else: + with open(input_file_path, 'w') as f: + for line in data: + f.write(line + '\n') + + model_dir = os.path.join(training_ouput_dir, 'model') + if not file_io.file_exists(model_dir): + raise ValueError('training_ouput_dir should contain the folder model') + + cmd = ['predict.py', + '--predict_data=%s' % input_file_path, + '--trained_model_dir=%s' % model_dir, + '--output_dir=%s' % tmp_dir, + '--output_format=csv', + '--batch_size=100', + '--mode=prediction', + '--no-shard_files'] + + print('Starting local prediction.') + predict.predict.main(cmd) + print('Local prediction done.') + + # Read the header file. + schema_file = os.path.join(tmp_dir, 'csv_schema.json') + with open(schema_file, 'r') as f: + schema = json.loads(f.read()) + + # Print any errors to the screen. 
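A hedged example of calling local_predict once training has produced a model; the csv values are made up and must match the training schema minus the target column:

predictions = local_predict('./sd_train',
                            ['12.5,Brooklyn,id_001',
                             '7.0,Queens,id_002'])
print(predictions.head())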
+ errors_file = glob.glob(os.path.join(tmp_dir, 'errors*')) + if errors_file and os.path.getsize(errors_file[0]) > 0: + print('Warning: there are errors. See below:') + with open(errors_file[0], 'r') as f: + text = f.read() + print(text) + + # Read the predictions data. + prediction_file = glob.glob(os.path.join(tmp_dir, 'predictions*')) + if not prediction_file: + raise FileNotFoundError('Prediction results not found') + predictions = pd.read_csv(prediction_file[0], + header=None, + names=[col['name'] for col in schema]) + return predictions + finally: + shutil.rmtree(tmp_dir) + + +def cloud_predict(model_name, model_version, data): + """Use Online prediction. + + Runs online prediction in the cloud and prints the results to the screen. For + running prediction on a large dataset or saving the results, run + local_batch_prediction or batch_prediction. + + Args: + model_name: deployed model name + model_version: depoyed model version + data: List of csv strings or a Pandas DataFrame that match the model schema. + + Before using this, the model must be created. This can be done by running + two gcloud commands: + 1) gcloud beta ml models create NAME + 2) gcloud beta ml versions create VERSION --model NAME \ + --origin gs://BUCKET/training_output_dir/model + or these datalab commands: + 1) import datalab + model = datalab.ml.ModelVersions(MODEL_NAME) + model.deploy(version_name=VERSION, + path='gs://BUCKET/training_output_dir/model') + Note that the model must be on GCS. + """ + import datalab.ml as ml + + + if isinstance(data, pd.DataFrame): + # write the df to csv. + string_buffer = StringIO.StringIO() + data.to_csv(string_buffer, header=None, index=False) + input_data = string_buffer.getvalue().split('\n') + + #remove empty strings + input_data = [line for line in input_data if line] + else: + input_data = data + + predictions = ml.ModelVersions(model_name).predict(model_version, input_data) + + # Convert predictions into a dataframe + df = pd.DataFrame(columns=sorted(predictions[0].keys())) + for i in range(len(predictions)): + for k, v in predictions[i].iteritems(): + df.loc[i, k] = v + return df + + +def local_batch_predict(training_ouput_dir, prediction_input_file, output_dir, + mode, + batch_size=1000, shard_files=True, output_format='csv'): + """Local batch prediction. + + Args: + training_ouput_dir: The output folder of training. + prediction_input_file: csv file pattern to a local file. + output_dir: output location to save the results. + mode: 'evaluation' or 'prediction'. If 'evaluation', the input data must + contain a target column. If 'prediction', the input data must not + contain a target column. + batch_size: Int. How many instances to run in memory at once. Larger values + mean better performace but more memeory consumed. + shard_files: If false, the output files are not shardded. + output_format: csv or json. Json file are json-newlined. 
+ """ + + if mode == 'evaluation': + model_dir = os.path.join(training_ouput_dir, 'evaluation_model') + elif mode == 'prediction': + model_dir = os.path.join(training_ouput_dir, 'model') + else: + raise ValueError('mode must be evaluation or prediction') + + if not file_io.file_exists(model_dir): + raise ValueError('Model folder %s does not exist' % model_dir) + + cmd = ['predict.py', + '--predict_data=%s' % prediction_input_file, + '--trained_model_dir=%s' % model_dir, + '--output_dir=%s' % output_dir, + '--output_format=%s' % output_format, + '--batch_size=%s' % str(batch_size), + '--shard_files' if shard_files else '--no-shard_files', + '--has_target' if mode == 'evaluation' else '--no-has_target' + ] + + print('Starting local batch prediction.') + predict.predict.main(cmd) + print('Local batch prediction done.') + + + +def cloud_batch_predict(training_ouput_dir, prediction_input_file, output_dir, + mode, + batch_size=1000, shard_files=True, output_format='csv'): + """Cloud batch prediction. Submitts a Dataflow job. + + See local_batch_predict() for a description of the args. + """ + if mode == 'evaluation': + model_dir = os.path.join(training_ouput_dir, 'evaluation_model') + elif mode == 'prediction': + model_dir = os.path.join(training_ouput_dir, 'model') + else: + raise ValueError('mode must be evaluation or prediction') + + if not file_io.file_exists(model_dir): + raise ValueError('Model folder %s does not exist' % model_dir) + + _assert_gcs_files([training_ouput_dir, prediction_input_file, + output_dir]) + + cmd = ['predict.py', + '--cloud', + '--project_id=%s' % _default_project(), + '--predict_data=%s' % prediction_input_file, + '--trained_model_dir=%s' % model_dir, + '--output_dir=%s' % output_dir, + '--output_format=%s' % output_format, + '--batch_size=%s' % str(batch_size), + '--shard_files' if shard_files else '--no-shard_files', + '--extra_package=%s' % _package_to_staging(output_dir)] + + print('Starting cloud batch prediction.') + predict.predict.main(cmd) + print('See above link for job status.') diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py b/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py new file mode 100644 index 000000000..aee640276 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/master_setup.py @@ -0,0 +1,73 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# A copy of this file must be made in datalab_solutions/structured_data/setup.py + +import datetime +import os +import re +from setuptools import setup + + + +# The version is saved in an __init__ file. 
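Looking back at the batch-prediction helpers above, a hedged usage sketch (all paths are illustrative):

local_batch_predict('./sd_train', './new_data*.csv', './sd_batch_out',
                    mode='prediction', output_format='json')
cloud_batch_predict('gs://my-bucket/sd_train', 'gs://my-bucket/new_data*.csv',
                    'gs://my-bucket/sd_batch_out', mode='evaluation')
# mode='evaluation' expects the target column to be present in the input files.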
+def get_version(): + VERSIONFILE = os.path.join('datalab_solutions/structured_data/', + '__init__.py') + if not os.path.isfile(VERSIONFILE): + raise ValueError('setup.py: File not found %s' % VERSIONFILE) + initfile_lines = open(VERSIONFILE, 'rt').readlines() + VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" + for line in initfile_lines: + mo = re.search(VSRE, line, re.M) + if mo: + return mo.group(1) + raise RuntimeError('Unable to find version string in %s.' % (VERSIONFILE,)) + + +setup( + name='structured_data', + version=get_version(), + packages=[ + 'datalab_solutions', + 'datalab_solutions.structured_data', + 'datalab_solutions.structured_data.trainer', + 'datalab_solutions.structured_data.preprocess', + 'datalab_solutions.structured_data.predict', + ], + description='Google Cloud Datalab Structured Data Package', + author='Google', + author_email='google-cloud-datalab-feedback@googlegroups.com', + keywords=[ + ], + license="Apache Software License", + classifiers=[ + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Development Status :: 4 - Beta", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules" + ], + long_description=""" + """, + install_requires=[ + 'tensorflow==1.0', + 'protobuf==3.1.0', + 'google-cloud-dataflow==0.5.5' + ], + package_data={ + }, + data_files=[], +) diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py new file mode 100644 index 000000000..b1c31965d --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/predict/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import predict + diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py b/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py new file mode 100644 index 000000000..458d5e677 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/predict/predict.py @@ -0,0 +1,415 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
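The module below is driven by a flag list; mirroring how _package.py invokes it, a hedged example (paths are illustrative):

main(['predict.py',
      '--predict_data=./new_data*.csv',
      '--trained_model_dir=./sd_train/model',
      '--output_dir=./sd_batch_out',
      '--output_format=csv',
      '--batch_size=100',
      '--no-shard_files'])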
+"""Runs prediction on a trained model.""" + + +import argparse +import datetime +import os +import sys + +import apache_beam as beam + + +def parse_arguments(argv): + """Parse command line arguments. + + Args: + argv: includes the script's name. + + Returns: + argparse object + """ + parser = argparse.ArgumentParser( + description='Runs Prediction inside a beam or Dataflow job.') + # cloud options + parser.add_argument('--project_id', + help='The project to which the job will be submitted.') + parser.add_argument('--cloud', + action='store_true', + help='Run preprocessing on the cloud.') + parser.add_argument('--job_name', + default=('structured-data-batch-prediction-' + + datetime.datetime.now().strftime('%Y%m%d%H%M%S')), + help='Dataflow job name. Must be unique over all jobs.') + parser.add_argument('--extra_package', + default=[], + action='append', + help=('If using --cloud, also installs these packages on ' + 'each dataflow worker')) + + # I/O args + parser.add_argument('--predict_data', + required=True, + help='Data to run prediction on') + parser.add_argument('--trained_model_dir', + required=True, + help='Usually train_output_path/model.') + parser.add_argument('--output_dir', + required=True, + help=('Location to save output.')) + + # Other args + parser.add_argument('--batch_size', + required=False, + default=1000, + type=int, + help=('Batch size. Larger values consumes more memrory ' + 'but takes less time to finish.')) + parser.add_argument('--shard_files', + dest='shard_files', + action='store_true', + help='Shard files') + parser.add_argument('--no-shard_files', + dest='shard_files', + action='store_false', + help='Don\'t shard files') + parser.set_defaults(shard_files=True) + + parser.add_argument('--output_format', + choices=['csv', 'json'], + default='csv', + help=""" + The output results. + raw_json: produces a newline file where each line is json. No + post processing is performed and the output matches what the trained + model produces. + csv: produces a csv file without a header row and a header csv file. + For classification problems, the vector of probabalities for each + target class is split into individual csv columns.""") + + args, _ = parser.parse_known_args(args=argv[1:]) + + if args.cloud: + if not args.project_id: + raise ValueError('--project_id needed with --cloud') + if not args.trained_model_dir.startswith('gs://'): + raise ValueError('trained_model_dir needs to be a GCS path,') + if not args.output_dir.startswith('gs://'): + raise ValueError('output_dir needs to be a GCS path.') + if not args.predict_data.startswith('gs://'): + raise ValueError('predict_data needs to be a GCS path.') + + + return args + + +class EmitAsBatchDoFn(beam.DoFn): + """A DoFn that buffers the records and emits them batch by batch.""" + + def __init__(self, batch_size): + """Constructor of EmitAsBatchDoFn beam.DoFn class. + + Args: + batch_size: the max size we want to buffer the records before emitting. 
+ """ + self._batch_size = batch_size + self._cached = [] + + def process(self, element): + self._cached.append(element) + if len(self._cached) >= self._batch_size: + emit = self._cached + self._cached = [] + yield emit + + def finish_bundle(self, element=None): + if len(self._cached) > 0: # pylint: disable=g-explicit-length-test + yield self._cached + + +class RunGraphDoFn(beam.DoFn): + """A DoFn for running the TF graph.""" + + def __init__(self, trained_model_dir): + self._trained_model_dir = trained_model_dir + self._session = None + + def start_bundle(self, element=None): + from tensorflow.python.saved_model import tag_constants + from tensorflow.contrib.session_bundle import bundle_shim + import json + + self._session, meta_graph = bundle_shim.load_session_bundle_or_saved_model_bundle_from_path(self._trained_model_dir, tags=[tag_constants.SERVING]) + signature = meta_graph.signature_def['serving_default'] + + # get the mappings between aliases and tensor names + # for both inputs and outputs + self._input_alias_map = {friendly_name: tensor_info_proto.name + for (friendly_name, tensor_info_proto) in signature.inputs.items() } + self._output_alias_map = {friendly_name: tensor_info_proto.name + for (friendly_name, tensor_info_proto) in signature.outputs.items() } + self._aliases, self._tensor_names = zip(*self._output_alias_map.items()) + + + def finish_bundle(self, element=None): + self._session.close() + + + def process(self, element): + """Run batch prediciton on a TF graph. + + Args: + element: list of strings, representing one batch input to the TF graph. + """ + import collections + import apache_beam as beam + + num_in_batch = 0 + try: + assert self._session is not None + + feed_dict = collections.defaultdict(list) + for line in element: + + # Remove trailing newline. + if line.endswith('\n'): + line = line[:-1] + + feed_dict[self._input_alias_map.values()[0]].append(line) + num_in_batch += 1 + + # batch_result is list of numpy arrays with batch_size many rows. + batch_result = self._session.run(fetches=self._tensor_names, + feed_dict=feed_dict) + + # ex batch_result for batch_size > 1: + # (array([value1, value2, ..., value_batch_size]), + # array([[a1, b1, c1]], ..., [a_batch_size, b_batch_size, c_batch_size]]), + # ...) + # ex batch_result for batch_size == 1: + # (value, + # array([a1, b1, c1]), + # ...) + + # Convert the results into a dict and unbatch the results. + if num_in_batch > 1: + for result in zip(*batch_result): + predictions = {} + for name, value in zip(self._aliases, result): + predictions[name] = (value.tolist() if getattr(value, 'tolist', None) + else value) + yield predictions + else: + predictions = {} + for i in range(len(self._aliases)): + value = batch_result[i] + value = (value.tolist() if getattr(value, 'tolist', None) + else value) + predictions[self._aliases[i]] = value + yield predictions + + except Exception as e: # pylint: disable=broad-except + yield beam.pvalue.SideOutputValue('errors', + (str(e), element)) + + +class RawJsonCoder(beam.coders.Coder): + """Coder for json newline files.""" + + def encode(self, obj): + """Encodes a python object into a JSON string. + + Args: + obj: python object. + + Returns: + JSON string. + """ + import json + return json.dumps(obj, separators=(',', ': ')) + + +class CSVCoder(beam.coders.Coder): + """Coder for CSV files containing the ouput of prediction.""" + + def __init__(self, header): + """Sets the headers in the csv file. + + Args: + header: list of strings that correspond to keys in the predictions dict. 
+ """ + self._header = header + + def make_header_string(self): + return ','.join(self._header) + + def encode(self, tf_graph_predictions): + """Encodes the graph json prediction into csv. + + Args: + tf_graph_predictions: python dict. + + Returns: + csv string. + """ + row = [] + for col in self._header: + row.append(str(tf_graph_predictions[col])) + + return ','.join(row) + + +class FormatAndSave(beam.PTransform): + + def __init__(self, args): + self._shard_name_template = None if args.shard_files else '' + self._output_format = args.output_format + self._output_dir = args.output_dir + + # Get the BQ schema if csv. + if self._output_format == 'csv': + from tensorflow.python.saved_model import tag_constants + from tensorflow.contrib.session_bundle import bundle_shim + from tensorflow.core.framework import types_pb2 + + session, meta_graph = bundle_shim.load_session_bundle_or_saved_model_bundle_from_path(args.trained_model_dir, tags=[tag_constants.SERVING]) + signature = meta_graph.signature_def['serving_default'] + + self._schema = [] + for friendly_name in sorted(signature.outputs): + tensor_info_proto = signature.outputs[friendly_name] + + # TODO(brandondutra): Could dtype be DT_INVALID? + # Consider getting the dtype from the graph via + # session.graph.get_tensor_by_name(tensor_info_proto.name).dtype) + dtype = tensor_info_proto.dtype + if dtype == types_pb2.DT_FLOAT or dtype == types_pb2.DT_DOUBLE: + bq_type = 'FLOAT' + elif dtype == types_pb2.DT_INT32 or dtype == types_pb2.DT_INT64: + bq_type = 'INTEGER' + else: + bq_type = 'STRING' + + self._schema.append({'mode': 'NULLABLE', + 'name': friendly_name, + 'type': bq_type}) + session.close() + + def apply(self, datasets): + return self.expand(datasets) + + def expand(self, datasets): + import json + + tf_graph_predictions, errors = datasets + + if self._output_format == 'json': + _ = ( + tf_graph_predictions + | 'Write Raw JSON' + >> beam.io.textio.WriteToText( + os.path.join(self._output_dir, 'predictions'), + file_name_suffix='.json', + coder=RawJsonCoder(), + shard_name_template=self._shard_name_template)) + elif self._output_format == 'csv': + # make a csv header file + header = [col['name'] for col in self._schema] + csv_coder = CSVCoder(header) + _ = ( + tf_graph_predictions.pipeline + | 'Make CSV Header' + >> beam.Create([json.dumps(self._schema, indent=2)]) + | 'Write CSV Schema File' + >> beam.io.textio.WriteToText( + os.path.join(self._output_dir, 'csv_schema'), + file_name_suffix='.json', + shard_name_template='')) + + # Write the csv predictions + _ = ( + tf_graph_predictions + | 'Write CSV' + >> beam.io.textio.WriteToText( + os.path.join(self._output_dir, 'predictions'), + file_name_suffix='.csv', + coder=csv_coder, + shard_name_template=self._shard_name_template)) + else: + raise ValueError('FormatAndSave: unknown format %s', self._output_format) + + + # Write the errors to a text file. + _ = (errors + | 'Write Errors' + >> beam.io.textio.WriteToText( + os.path.join(self._output_dir, 'errors'), + file_name_suffix='.txt', + shard_name_template=self._shard_name_template)) + + +def make_prediction_pipeline(pipeline, args): + """Builds the prediction pipeline. + + Reads the csv files, prepends a ',' if the target column is missing, run + prediction, and then prints the formated results to a file. 
+ + Args: + pipeline: the pipeline + args: command line args + """ + predicted_values, errors = ( + pipeline + | 'Read CSV Files' + >> beam.io.ReadFromText(str(args.predict_data), # DF bug: DF does not work with unicode strings + strip_trailing_newlines=True) + | 'Batch Input' + >> beam.ParDo(EmitAsBatchDoFn(args.batch_size)) + | 'Run TF Graph on Batches' + >> (beam.ParDo(RunGraphDoFn(args.trained_model_dir)) + .with_outputs('errors', main='main'))) + + _ = ( + (predicted_values, errors) + | 'Format and Save' + >> FormatAndSave(args)) + + +def main(argv=None): + args = parse_arguments(sys.argv if argv is None else argv) + + if args.cloud: + options = { + 'staging_location': os.path.join(args.output_dir, 'tmp', 'staging'), + 'temp_location': os.path.join(args.output_dir, 'tmp', 'staging'), + 'job_name': args.job_name, + 'project': args.project_id, + 'no_save_main_session': True, + 'extra_packages': args.extra_package, + 'teardown_policy': 'TEARDOWN_ALWAYS', + } + opts = beam.pipeline.PipelineOptions(flags=[], **options) + # Or use BlockingDataflowPipelineRunner + p = beam.Pipeline('DataflowRunner', options=opts) + else: + p = beam.Pipeline('DirectRunner') + + make_prediction_pipeline(p, args) + + if args.cloud: + print(('Dataflow Job submitted, see Job %s at ' + 'https://console.developers.google.com/dataflow?project=%s') % + (options['job_name'], args.project_id)) + sys.stdout.flush() + + r = p.run() + try: + r.wait_until_finish() + except AttributeError: + pass + + +if __name__ == '__main__': + main() diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py new file mode 100644 index 000000000..feba1d08c --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import cloud_preprocess +import local_preprocess \ No newline at end of file diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py new file mode 100644 index 000000000..2c86497b5 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/cloud_preprocess.py @@ -0,0 +1,273 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import pandas as pd
+import StringIO
+import sys
+
+
+from tensorflow.python.lib.io import file_io
+
+SCHEMA_FILE = 'schema.json'
+NUMERICAL_ANALYSIS_FILE = 'numerical_analysis.json'
+CATEGORICAL_ANALYSIS_FILE = 'vocab_%s.csv'
+
+
+def parse_arguments(argv):
+  """Parse command line arguments.
+
+  Args:
+    argv: list of command line arguments, including program name.
+
+  Returns:
+    An argparse Namespace object.
+
+  Raises:
+    ValueError: for bad parameters.
+  """
+  parser = argparse.ArgumentParser(
+      description='Runs Preprocessing on structured data.')
+  parser.add_argument('--output_dir',
+                      type=str,
+                      required=True,
+                      help='Google Cloud Storage location in which to place outputs.')
+
+  parser.add_argument('--schema_file',
+                      type=str,
+                      required=False,
+                      help=('BigQuery json schema file'))
+  parser.add_argument('--input_file_pattern',
+                      type=str,
+                      required=False,
+                      help='Input CSV file names. May contain a file pattern.')
+
+  # If using a BigQuery table.
+  # TODO(brandondutra): maybe also support a SQL input, so the table can be
+  # ad-hoc.
+  parser.add_argument('--bigquery_table',
+                      type=str,
+                      required=False,
+                      help=('project:dataset.table_name'))
+
+  args = parser.parse_args(args=argv[1:])
+
+  if not args.output_dir.startswith('gs://'):
+    raise ValueError('--output_dir must point to a location on GCS')
+
+  if args.bigquery_table:
+    if args.schema_file or args.input_file_pattern:
+      raise ValueError('If using --bigquery_table, then --schema_file and '
+                       '--input_file_pattern '
+                       'are not needed.')
+  else:
+    if not args.schema_file or not args.input_file_pattern:
+      raise ValueError('If not using --bigquery_table, then --schema_file and '
+                       '--input_file_pattern '
+                       'are required.')
+
+    if not args.input_file_pattern.startswith('gs://'):
+      raise ValueError('--input_file_pattern must point to files on GCS')
+
+  return args
+
+
+def parse_table_name(bigquery_table):
+  """Given a string project_id:dataset.table, returns dataset.table.
+
+  Args:
+    bigquery_table: full table name project_id:dataset.table
+
+  Returns:
+    dataset.table
+
+  Raises:
+    ValueError: if the name is not of the form project_id:dataset.table.
+  """
+
+  id_name = bigquery_table.split(':')
+  if len(id_name) != 2:
+    raise ValueError('Bigquery table name should be in the form '
+                     'project_id:dataset.table_name. Got %s' % bigquery_table)
+  return id_name[1]
+
+
+def run_numerical_analysis(table, schema_list, args):
+  """Finds min/max/mean values for the numerical columns and writes a json file.
+
+  Args:
+    table: Reference to FederatedTable (if bigquery_table is false) or a
+        regular Table (otherwise)
+    schema_list: BigQuery schema json object
+    args: the command line args
+  """
+  import datalab.bigquery as bq
+
+  # Get list of numerical columns.
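# [Illustrative sketch, not part of this patch] The step below filters the
# BigQuery schema down to numeric column names. Schema entries are dicts such
# as {'name': 'num1', 'type': 'FLOAT', 'mode': 'NULLABLE'}; for example:
example_schema = [{'name': 'num1', 'type': 'FLOAT'},
                  {'name': 'str1', 'type': 'STRING'},
                  {'name': 'num2', 'type': 'INTEGER'}]
numeric_names = [col['name'] for col in example_schema
                 if col['type'].lower() in ('integer', 'float')]
# numeric_names == ['num1', 'num2']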
+ numerical_columns = [] + for col_schema in schema_list: + col_type = col_schema['type'].lower() + if col_type == 'integer' or col_type == 'float': + numerical_columns.append(col_schema['name']) + + + # Run the numerical analysis + if numerical_columns: + sys.stdout.write('Running numerical analysis...') + max_min = [ + ('max({name}) as max_{name}, ' + 'min({name}) as min_{name}, ' + 'avg({name}) as avg_{name} ').format(name=name) + for name in numerical_columns] + if args.bigquery_table: + sql = 'SELECT %s from %s' % (', '.join(max_min), + parse_table_name(args.bigquery_table)) + numerical_results = bq.Query(sql).to_dataframe() + else: + sql = 'SELECT %s from csv_table' % ', '.join(max_min) + query = bq.Query(sql, data_sources={'csv_table': table}) + numerical_results = query.to_dataframe() + + # Convert the numerical results to a json file. + results_dict = {} + for name in numerical_columns: + results_dict[name] = {'max': numerical_results.iloc[0]['max_%s' % name], + 'min': numerical_results.iloc[0]['min_%s' % name], + 'mean':numerical_results.iloc[0]['avg_%s' % name]} + + file_io.write_string_to_file( + os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE), + json.dumps(results_dict, indent=2, separators=(',', ': '))) + + sys.stdout.write('done.\n') + + +def run_categorical_analysis(table, schema_list, args): + """Find vocab values for the categorical columns and writes a csv file. + + The vocab files are in the from + label1 + label2 + label3 + ... + + Args: + table: Reference to FederatedTable (if bigquery_table is false) or a + regular Table (otherwise) + schema_list: Bigquery schema json object + args: the command line args + """ + import datalab.bigquery as bq + + + # Get list of categorical columns. + categorical_columns = [] + for col_schema in schema_list: + col_type = col_schema['type'].lower() + if col_type == 'string': + categorical_columns.append(col_schema['name']) + + if categorical_columns: + sys.stdout.write('Running categorical analysis...') + for name in categorical_columns: + if args.bigquery_table: + table_name = parse_table_name(args.bigquery_table) + else: + table_name = 'table_name' + + sql = """ + SELECT + {name}, + FROM + {table} + WHERE + {name} IS NOT NULL + GROUP BY + {name} + ORDER BY + {name} + """.format(name=name, table=table_name) + out_file = os.path.join(args.output_dir, + CATEGORICAL_ANALYSIS_FILE % name) + + # extract_async seems to have a bug and sometimes hangs. So get the + # results direclty. + if args.bigquery_table: + df = bq.Query(sql).to_dataframe() + else: + query = bq.Query(sql, data_sources={table_name: table}) + df = query.to_dataframe() + + # Write the results to a file. + string_buff = StringIO.StringIO() + df.to_csv(string_buff, index=False, header=False) + file_io.write_string_to_file(out_file, string_buff.getvalue()) + + + sys.stdout.write('done.\n') + + +def run_analysis(args): + """Builds an analysis file for training. + + Uses BiqQuery tables to do the analysis. + + Args: + args: command line args + + Raises: + ValueError if schema contains unknown types. + """ + import datalab.bigquery as bq + if args.bigquery_table: + table = bq.Table(args.bigquery_table) + schema_list = table.schema._bq_schema + else: + schema_list = json.loads(file_io.read_file_to_string(args.schema_file)) + table = bq.FederatedTable().from_storage( + source=args.input_file_pattern, + source_format='csv', + ignore_unknown_values=False, + max_bad_records=0, + compressed=False, + schema=bq.Schema(schema_list)) + + # Check the schema is supported. 
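# [Illustrative sketch, not part of this patch] For numerical columns
# ['num1', 'num2'], the query assembled in run_numerical_analysis above
# expands to roughly:
#
#   SELECT max(num1) as max_num1, min(num1) as min_num1, avg(num1) as avg_num1,
#          max(num2) as max_num2, min(num2) as min_num2, avg(num2) as avg_num2
#   FROM dataset.table
#
# and the single result row is then reshaped into
#   {'num1': {'max': ..., 'min': ..., 'mean': ...}, 'num2': {...}}
# before being written to numerical_analysis.json.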
+ for col_schema in schema_list: + col_type = col_schema['type'].lower() + if col_type != 'string' and col_type != 'integer' and col_type != 'float': + raise ValueError('Unknown schema type %s' % col_type) + + run_numerical_analysis(table, schema_list, args) + run_categorical_analysis(table, schema_list, args) + + # Save a copy of the schema to the output location. + file_io.write_string_to_file( + os.path.join(args.output_dir, SCHEMA_FILE), + json.dumps(schema_list, indent=2, separators=(',', ': '))) + + +def main(argv=None): + args = parse_arguments(sys.argv if argv is None else argv) + run_analysis(args) + + +if __name__ == '__main__': + main() diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py new file mode 100644 index 000000000..5a5d1a5a9 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/preprocess/local_preprocess.py @@ -0,0 +1,166 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import argparse +import collections +import json +import os +import sys + + +from tensorflow.python.lib.io import file_io + + +SCHEMA_FILE = 'schema.json' +NUMERICAL_ANALYSIS_FILE = 'numerical_analysis.json' +CATEGORICAL_ANALYSIS_FILE = 'vocab_%s.csv' + + +def parse_arguments(argv): + """Parse command line arguments. + + Args: + argv: list of command line arguments, includeing programe name. + + Returns: + An argparse Namespace object. + """ + parser = argparse.ArgumentParser( + description='Runs Preprocessing on structured CSV data.') + parser.add_argument('--input_file_pattern', + type=str, + required=True, + help='Input CSV file names. May contain a file pattern') + parser.add_argument('--output_dir', + type=str, + required=True, + help='Google Cloud Storage which to place outputs.') + parser.add_argument('--schema_file', + type=str, + required=True, + help=('BigQuery json schema file')) + + args = parser.parse_args(args=argv[1:]) + + # Make sure the output folder exists if local folder. + file_io.recursive_create_dir(args.output_dir) + + return args + + +def run_numerical_categorical_analysis(args, schema_list): + """Makes the numerical and categorical analysis files. + + Args: + args: the command line args + schema_list: python object of the schema json file. + + Raises: + ValueError: if schema contains unknown column types. + """ + header = [column['name'] for column in schema_list] + input_files = file_io.get_matching_files(args.input_file_pattern) + + # Check the schema is valid + for col_schema in schema_list: + col_type = col_schema['type'].lower() + if col_type != 'string' and col_type != 'integer' and col_type != 'float': + raise ValueError('Schema contains an unsupported type %s.' 
% col_type) + + # initialize the results + def _init_numerical_results(): + return {'min': float('inf'), + 'max': float('-inf'), + 'count': 0, + 'sum': 0.0} + numerical_results = collections.defaultdict(_init_numerical_results) + categorical_results = collections.defaultdict(set) + + # for each file, update the numerical stats from that file, and update the set + # of unique labels. + for input_file in input_files: + with file_io.FileIO(input_file, 'r') as f: + for line in f: + parsed_line = dict(zip(header, line.strip().split(','))) + + for col_schema in schema_list: + col_name = col_schema['name'] + col_type = col_schema['type'] + if col_type.lower() == 'string': + categorical_results[col_name].update([parsed_line[col_name]]) + else: + # numerical column. + + # if empty, skip + if not parsed_line[col_name].strip(): + continue; + + numerical_results[col_name]['min'] = ( + min(numerical_results[col_name]['min'], + float(parsed_line[col_name]))) + numerical_results[col_name]['max'] = ( + max(numerical_results[col_name]['max'], + float(parsed_line[col_name]))) + numerical_results[col_name]['count'] += 1 + numerical_results[col_name]['sum'] += float(parsed_line[col_name]) + + # Update numerical_results to just have min/min/mean + for col_schema in schema_list: + if col_schema['type'].lower() != 'string': + col_name = col_schema['name'] + mean = numerical_results[col_name]['sum'] / numerical_results[col_name]['count'] + del numerical_results[col_name]['sum'] + del numerical_results[col_name]['count'] + numerical_results[col_name]['mean'] = mean + + + # Write the numerical_results to a json file. + file_io.write_string_to_file( + os.path.join(args.output_dir, NUMERICAL_ANALYSIS_FILE), + json.dumps(numerical_results, indent=2, separators=(',', ': '))) + + # Write the vocab files. Each label is on its own line. + for name, unique_labels in categorical_results.iteritems(): + labels = '\n'.join(list(unique_labels)) + file_io.write_string_to_file( + os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name), + labels) + + +def run_analysis(args): + """Builds an analysis files for training.""" + + # Read the schema and input feature types + schema_list = json.loads(file_io.read_file_to_string(args.schema_file)) + + run_numerical_categorical_analysis(args, schema_list) + + # Also save a copy of the schema in the output folder. + file_io.copy(args.schema_file, + os.path.join(args.output_dir, SCHEMA_FILE), + overwrite=True) + + +def main(argv=None): + args = parse_arguments(sys.argv if argv is None else argv) + run_analysis(args) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py new file mode 100644 index 000000000..f057df8e8 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/e2e_functions.py @@ -0,0 +1,208 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
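# [Illustrative sketch, not part of this patch] local_preprocess.py above keeps
# running {'min', 'max', 'count', 'sum'} per numeric column and converts them
# to min/max/mean at the end. The same update rule in isolation:
def update_stats(stats, value):
  stats['min'] = min(stats['min'], value)
  stats['max'] = max(stats['max'], value)
  stats['count'] += 1
  stats['sum'] += value

stats = {'min': float('inf'), 'max': float('-inf'), 'count': 0, 'sum': 0.0}
for v in [3.0, 7.5, 1.2]:
  update_stats(stats, v)
mean = stats['sum'] / stats['count']  # 3.9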
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +import os +import random +import json +import subprocess + + +def make_csv_data(filename, num_rows, problem_type, keep_target=True): + """Writes csv data for preprocessing and training. + + Args: + filename: writes data to csv file. + num_rows: how many rows of data will be generated. + problem_type: 'classification' or 'regression'. Changes the target value. + keep_target: if false, the csv file will have an empty column ',,' for the + target. + """ + random.seed(12321) + with open(filename, 'w') as f1: + for i in range(num_rows): + num1 = random.uniform(0, 30) + num2 = random.randint(0, 20) + num3 = random.uniform(0, 10) + + str1 = random.choice(['red', 'blue', 'green', 'pink', 'yellow', 'brown', 'black']) + str2 = random.choice(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr']) + str3 = random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone']) + + map1 = {'red': 2, 'blue': 6, 'green': 4, 'pink': -5, 'yellow': -6, 'brown': -1, 'black': 7} + map2 = {'abc': 10, 'def': 1, 'ghi': 1, 'jkl': 1, 'mno': 1, 'pqr': 1} + map3 = {'car': 5, 'truck': 10, 'van': 15, 'bike': 20, 'train': 25, 'drone': 30} + + # Build some model. + t = 0.5 + 0.5*num1 -2.5*num2 + num3 + t += map1[str1] + map2[str2] + map3[str3] + + if problem_type == 'classification': + if t < 0: + t = 100 + elif t < 20: + t = 101 + else: + t = 102 + + if keep_target: + csv_line = "{id},{target},{num1},{num2},{num3},{str1},{str2},{str3}\n".format( + id=i, + target=t, + num1=num1, + num2=num2, + num3=num3, + str1=str1, + str2=str2, + str3=str3) + else: + csv_line = "{id},{num1},{num2},{num3},{str1},{str2},{str3}\n".format( + id=i, + num1=num1, + num2=num2, + num3=num3, + str1=str1, + str2=str2, + str3=str3) + f1.write(csv_line) + + +def make_preprocess_schema(filename, problem_type): + """Makes a schema file compatable with the output of make_csv_data. + + Writes a json file. 
+ + Args: + filename: output file path + problem_type: regression or classification + """ + schema = [ + { + "mode": "NULLABLE", + "name": "key", + "type": "STRING" + }, + { + "mode": "REQUIRED", + "name": "target", + "type": ("STRING" if problem_type == 'classification' else "FLOAT") + }, + { + "mode": "NULLABLE", + "name": "num1", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "num2", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "num3", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "str1", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "str2", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "str3", + "type": "STRING" + } + ] + with open(filename, 'w') as f: + f.write(json.dumps(schema)) + + +def run_preprocess(output_dir, csv_filename, schema_filename): + preprocess_script = os.path.abspath( + os.path.join(os.path.dirname(__file__), + '../preprocess/local_preprocess.py')) + + cmd = ['python', preprocess_script, + '--output_dir', output_dir, + '--input_file_pattern', csv_filename, + '--schema_file', schema_filename + ] + print('Going to run command: %s' % ' '.join(cmd)) + subprocess.check_call(cmd) #, stderr=open(os.devnull, 'wb')) + + +def run_training( + train_data_paths, + eval_data_paths, + output_path, + preprocess_output_dir, + transforms_file, + max_steps, + model_type, + extra_args=[]): + """Runs Training via gcloud beta ml local train. + + Args: + train_data_paths: training csv files + eval_data_paths: eval csv files + output_path: folder to write output to + preprocess_output_dir: output location of preprocessing + transforms_file: path to transforms file + max_steps: max training steps + model_type: {dnn,linear}_{regression,classification} + extra_args: array of strings, passed to the trainer. + + Returns: + The stderr of training as one string. TF writes to stderr, so basically, the + output of training. + """ + + # Gcloud has the fun bug that you have to be in the parent folder of task.py + # when you call it. So cd there first. 
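# [Illustrative sketch, not part of this patch] With hypothetical local paths,
# the command assembled below ends up roughly as:
#
#   cd <package dir> && gcloud beta ml local train \
#       --module-name=trainer.task --package-path=trainer -- \
#       --train_data_paths=/tmp/train.csv --eval_data_paths=/tmp/eval.csv \
#       --output_path=/tmp/train_out --preprocess_output_dir=/tmp/pout \
#       --transforms_file=/tmp/transforms.json \
#       --model_type=dnn_regression --max_steps=2500
#
# The trainer writes its progress to stderr, which is captured and returned.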
+ task_parent_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), '..')) + cmd = ['cd %s &&' % task_parent_folder, + 'gcloud beta ml local train', + '--module-name=trainer.task', + '--package-path=trainer', + '--', + '--train_data_paths=%s' % train_data_paths, + '--eval_data_paths=%s' % eval_data_paths, + '--output_path=%s' % output_path, + '--preprocess_output_dir=%s' % preprocess_output_dir, + '--transforms_file=%s' % transforms_file, + '--model_type=%s' % model_type, + '--max_steps=%s' % max_steps] + extra_args + print('Going to run command: %s' % ' '.join(cmd)) + sp = subprocess.Popen(' '.join(cmd), shell=True, stderr=subprocess.PIPE) + _, err = sp.communicate() + return err + +if __name__ == '__main__': + make_csv_data('raw_train_regression.csv', 5000, 'regression', True) + make_csv_data('raw_eval_regression.csv', 1000, 'regression', True) + make_csv_data('raw_predict_regression.csv', 100, 'regression', False) + make_preprocess_schema('schema_regression.json', 'regression') + + make_csv_data('raw_train_classification.csv', 5000, 'classification', True) + make_csv_data('raw_eval_classification.csv', 1000, 'classification', True) + make_csv_data('raw_predict_classification.csv', 100, 'classification', False) + make_preprocess_schema('schema_classification.json', 'classification') + diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py new file mode 100644 index 000000000..c1e0dfa80 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_preprocess.py @@ -0,0 +1,104 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import glob +import json +import os +import shutil +import subprocess +import filecmp +import tempfile +import unittest + +import tensorflow as tf + +import e2e_functions + + +class TestPreprocess(unittest.TestCase): + + def setUp(self): + self._test_dir = tempfile.mkdtemp() + + self._csv_filename = os.path.join(self._test_dir, 'raw_csv_data.csv') + self._schema_filename = os.path.join(self._test_dir, 'schema.json') + + self._preprocess_output = os.path.join(self._test_dir, 'pout') + + def tearDown(self): + print('TestPreprocess: removing test dir: ' + self._test_dir) + shutil.rmtree(self._test_dir) + + + def _make_test_data(self, problem_type): + """Makes input files to run preprocessing on. 
+ + Args: + problem_type: 'regression' or 'classification' + """ + e2e_functions.make_csv_data(self._csv_filename, 100, problem_type, True) + e2e_functions.make_preprocess_schema(self._schema_filename, problem_type) + + + def _test_preprocess(self, problem_type): + self._make_test_data(problem_type) + + e2e_functions.run_preprocess( + output_dir=self._preprocess_output, + csv_filename=self._csv_filename, + schema_filename=self._schema_filename) + + + schema_file = os.path.join(self._preprocess_output, 'schema.json') + numerical_analysis_file = os.path.join(self._preprocess_output, 'numerical_analysis.json') + + # test schema file was copied + self.assertTrue(filecmp.cmp(schema_file, self._schema_filename)) + + expected_numerical_keys = ['num1', 'num2', 'num3'] + if problem_type == 'regression': + expected_numerical_keys.append('target') + + # Load the numerical analysis file and check it has the right keys + with open(numerical_analysis_file, 'r') as f: + analysis = json.load(f) + self.assertEqual(sorted(expected_numerical_keys), sorted(analysis.keys())) + + # Check that the vocab files are made + expected_vocab_files = ['vocab_str1.csv', 'vocab_str2.csv', + 'vocab_str3.csv', 'vocab_key.csv'] + if problem_type == 'classification': + expected_vocab_files.append('vocab_target.csv') + + for name in expected_vocab_files: + vocab_file = os.path.join(self._preprocess_output, name) + self.assertTrue(os.path.exists(vocab_file)) + self.assertGreater(os.path.getsize(vocab_file), 0) + + all_expected_files = (expected_vocab_files + ['numerical_analysis.json', + 'schema.json']) + all_file_paths = glob.glob(os.path.join(self._preprocess_output, '*')) + all_files = [os.path.basename(path) for path in all_file_paths] + self.assertEqual(sorted(all_expected_files), sorted(all_files)) + + + def testRegression(self): + self._test_preprocess('regression') + + def testClassification(self): + self._test_preprocess('classification') + +if __name__ == '__main__': + unittest.main() diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py new file mode 100644 index 000000000..6d2b7b7cd --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/test/test_trainer.py @@ -0,0 +1,244 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import json +import os +import re +import shutil +import tempfile +import unittest + +import e2e_functions + + +class TestTrainer(unittest.TestCase): + """Tests training. + + Each test builds a csv test dataset. Preprocessing is run on the data to + produce analysis. Training is then ran, and the output is collected and the + accuracy/loss values are inspected. 
+ """ + + def setUp(self): + self._test_dir = tempfile.mkdtemp() + self._preprocess_output = os.path.join(self._test_dir, 'pre') + self._train_output = os.path.join(self._test_dir, 'train') + + os.mkdir(self._preprocess_output) + os.mkdir(self._train_output) + + self._csv_train_filename = os.path.join(self._test_dir, 'train_csv_data.csv') + self._csv_eval_filename = os.path.join(self._test_dir, 'eval_csv_data.csv') + self._schema_filename = os.path.join(self._test_dir, 'schema.json') + self._input_features_filename = os.path.join(self._test_dir, + 'input_features_file.json') + + self._transforms_filename = os.path.join(self._test_dir, 'transforms.json') + + + def tearDown(self): + print('TestTrainer: removing test dir ' + self._test_dir) + shutil.rmtree(self._test_dir) + + + def _run_training(self, problem_type, model_type, transforms, extra_args=[]): + """Runs training. + + Output is saved to _training_screen_output. Nothing from training should be + printed to the screen. + + Args: + problem_type: 'regression' or 'classification' + model_type: 'linear' or 'dnn' + transform: JSON object of the transforms file. + extra_args: list of strings to pass to the trainer. + """ + # Run preprocessing. + e2e_functions.make_csv_data(self._csv_train_filename, 100, problem_type, True) + e2e_functions.make_csv_data(self._csv_eval_filename, 100, problem_type, True) + e2e_functions.make_preprocess_schema(self._schema_filename, problem_type) + + e2e_functions.run_preprocess( + output_dir=self._preprocess_output, + csv_filename=self._csv_train_filename, + schema_filename=self._schema_filename) + + # Write the transforms file. + with open(self._transforms_filename, 'w') as f: + f.write(json.dumps(transforms, indent=2, separators=(',', ': '))) + + # Run training and save the output. + output = e2e_functions.run_training( + train_data_paths=self._csv_train_filename, + eval_data_paths=self._csv_eval_filename, + output_path=self._train_output, + preprocess_output_dir=self._preprocess_output, + transforms_file=self._transforms_filename, + max_steps=2500, + model_type=model_type + '_' + problem_type, + extra_args=extra_args) + self._training_screen_output = output + #print(self._training_screen_output) + + + def _check_training_screen_output(self, accuracy=None, loss=None): + """Should be called after _run_training. + + Inspects self._training_screen_output for correct output. + + Args: + accuracy: float. Eval accuracy should be > than this number. + loss: flaot. Eval loss should be < than this number. + """ + # Print the last line of training output which has the loss value. + lines = self._training_screen_output.splitlines() + for line in lines: + if line.startswith('INFO:tensorflow:Saving dict for global step %s:' % 2500): + last_line = line + break + print(last_line) + + # supports positive numbers (int, real) with exponential form support. 
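# [Illustrative sketch, not part of this patch] How the regexes below pull
# numbers out of the final eval line; the sample line is hypothetical but has
# the shape the check expects:
import re
sample = ('INFO:tensorflow:Saving dict for global step 2500: '
          'accuracy = 0.9275, global_step = 2500, loss = 0.0817')
positive_number_re = re.compile(r'[+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
step_str = re.findall(r'global_step = \d+', sample)[0]    # 'global_step = 2500'
step = int(positive_number_re.findall(step_str)[0])       # 2500
acc_str = re.findall(r'accuracy = ' + positive_number_re.pattern, sample)[0]
accuracy = float(positive_number_re.findall(acc_str)[0])  # 0.9275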
+ positive_number_re = re.compile('[+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?') + + # Check it made it to step 2500 + saving_num_re = re.compile('global_step = \d+') + saving_num = saving_num_re.findall(last_line) + # saving_num == ['Saving evaluation summary for step NUM'] + self.assertEqual(len(saving_num), 1) + step_num = positive_number_re.findall(saving_num[0]) + # step_num == ['2500'] + self.assertEqual(len(step_num), 1) + self.assertEqual(int(step_num[0]), 2500) + + # Check the accuracy + if accuracy is not None: + accuracy_eq_num_re = re.compile('accuracy = [+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?') + accuracy_eq_num = accuracy_eq_num_re.findall(last_line) + # accuracy_eq_num == ['accuracy = NUM'] + self.assertEqual(len(accuracy_eq_num), 1) + accuracy_num = positive_number_re.findall(accuracy_eq_num[0]) + # accuracy_num == ['X.XXX'] + self.assertEqual(len(accuracy_num), 1) + self.assertGreater(float(accuracy_num[0]), accuracy) + + if loss is not None: + loss_eq_num_re = re.compile('loss = [+]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?') + loss_eq_num = loss_eq_num_re.findall(last_line) + # loss_eq_num == ['loss = NUM'] + self.assertEqual(len(loss_eq_num), 1) + loss_num = positive_number_re.findall(loss_eq_num[0]) + # loss_num == ['X.XXX'] + self.assertEqual(len(loss_num), 1) + self.assertLess(float(loss_num[0]), loss) + + + def _check_train_files(self): + model_folder = os.path.join(self._train_output, + 'train/export/prediction_model') + self.assertTrue( + os.path.isfile(os.path.join(model_folder, 'saved_model.pb'))) + self.assertTrue( + os.path.isfile(os.path.join(model_folder, 'variables/variables.index'))) + self.assertTrue( + os.path.isfile(os.path.join(model_folder, 'assets.extra/schema.json'))) + self.assertTrue( + os.path.isfile(os.path.join(model_folder, 'assets.extra/transforms.json'))) + + + def testRegressionDnn(self): + print('\n\nTesting Regression DNN') + + transforms = { + "num1": {"transform": "scale"}, + "num2": {"transform": "scale","value": 4}, + "str1": {"transform": "hash_embedding", "embedding_dim": 2, "hash_bucket_size": 4}, + "str2": {"transform": "embedding", "embedding_dim": 3}, + "target": {"transform": "target"}, + "key": {"transform": "key"}, + } + + extra_args = ['--layer_size1=10', '--layer_size2=10', '--layer_size3=5'] + self._run_training(problem_type='regression', + model_type='dnn', + transforms=transforms, + extra_args=extra_args) + + self._check_training_screen_output(loss=20) + self._check_train_files() + + + def testRegressionLinear(self): + print('\n\nTesting Regression Linear') + + transforms = { + "num1": {"transform": "scale"}, + "num2": {"transform": "scale","value": 4}, + "str1": {"transform": "hash_sparse", "hash_bucket_size": 2}, + "str2": {"transform": "hash_sparse", "hash_bucket_size": 2}, + "str3": {"transform": "hash_sparse", "hash_bucket_size": 2}, + "target": {"transform": "target"}, + "key": {"transform": "key"}, + } + + self._run_training(problem_type='regression', + model_type='linear', + transforms=transforms) + + self._check_training_screen_output(loss=100) + self._check_train_files() + + + def testClassificationDnn(self): + print('\n\nTesting classification DNN') + + transforms = { + "num1": {"transform": "scale"}, + "num2": {"transform": "scale","value": 4}, + "str1": {"transform": "hash_one_hot", "hash_bucket_size": 4}, + "str2": {"transform": "one_hot"}, + "str3": {"transform": "embedding", "embedding_dim": 3}, + "target": {"transform": "target"}, + "key": {"transform": "key"} + } + + extra_args = ['--layer_size1=10', '--layer_size2=10', 
'--layer_size3=5'] + self._run_training(problem_type='classification', + model_type='dnn', + transforms=transforms, + extra_args=extra_args) + + self._check_training_screen_output(accuracy=0.95, loss=0.09) + self._check_train_files() + + + def testClassificationLinear(self): + print('\n\nTesting classification Linear') + + transforms = { + "num1": {"transform": "scale"}, + "num2": {"transform": "scale","value": 4}, + "str1": {"transform": "hash_sparse", "hash_bucket_size": 4}, + "str2": {"transform": "sparse"}, + "target": {"transform": "target"}, + "key": {"transform": "key"}, + } + + self._run_training(problem_type='classification', + model_type='linear', + transforms=transforms) + + self._check_training_screen_output(accuracy=0.90, loss=0.2) + self._check_train_files() + diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py new file mode 100755 index 000000000..dc494405f --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import task diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py new file mode 100755 index 000000000..5cfbd1ba3 --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/task.py @@ -0,0 +1,269 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import json +import os +import re +import sys +import math + +from . 
import util +import tensorflow as tf +from tensorflow.contrib import metrics as metrics_lib + +from tensorflow.contrib.learn.python.learn import learn_runner +from tensorflow.contrib.session_bundle import manifest_pb2 +from tensorflow.python.lib.io import file_io + + +def get_reader_input_fn(train_config, preprocess_output_dir, model_type, + data_paths, batch_size, shuffle, num_epochs=None): + """Builds input layer for training.""" + + def get_input_features(): + """Read the input features from the given data paths.""" + _, examples = util.read_examples( + input_files=data_paths, + batch_size=batch_size, + shuffle=shuffle, + num_epochs=num_epochs) + features = util.parse_example_tensor(examples=examples, + train_config=train_config, + keep_target=True) + + target_name = train_config['target_column'] + target = features.pop(target_name) + features, target = util.preprocess_input( + features=features, + target=target, + train_config=train_config, + preprocess_output_dir=preprocess_output_dir, + model_type=model_type) + + return features, target + + # Return a function to input the feaures into the model from a data path. + return get_input_features + + +def get_experiment_fn(args): + """Builds the experiment function for learn_runner.run. + + Args: + args: the command line args + + Returns: + A function that returns a tf.learn experiment object. + """ + + def get_experiment(output_dir): + # Merge schema, input features, and transforms. + train_config = util.merge_metadata(args.preprocess_output_dir, + args.transforms_file) + + # Get the model to train. + estimator = util.get_estimator(output_dir, train_config, args) + + # Save a copy of the scehma and input to the model folder. + schema_file = os.path.join(args.preprocess_output_dir, util.SCHEMA_FILE) + + # Make list of files to save with the trained model. + additional_assets = {'transforms.json': args.transforms_file, + util.SCHEMA_FILE: schema_file} + if util.is_classification_model(args.model_type): + target_name = train_config['target_column'] + vocab_file_name = util.CATEGORICAL_ANALYSIS % target_name + vocab_file_path = os.path.join( + args.preprocess_output_dir, vocab_file_name) + assert file_io.file_exists(vocab_file_path) + additional_assets[vocab_file_name] = vocab_file_path + + export_strategy_target = util.make_export_strategy( + train_config=train_config, + args=args, + keep_target=True, + assets_extra=additional_assets) + export_strategy_notarget = util.make_export_strategy( + train_config=train_config, + args=args, + keep_target=False, + assets_extra=additional_assets) + + input_reader_for_train = get_reader_input_fn( + train_config=train_config, + preprocess_output_dir=args.preprocess_output_dir, + model_type=args.model_type, + data_paths=args.train_data_paths, + batch_size=args.train_batch_size, + shuffle=True, + num_epochs=args.num_epochs) + + input_reader_for_eval = get_reader_input_fn( + train_config=train_config, + preprocess_output_dir=args.preprocess_output_dir, + model_type=args.model_type, + data_paths=args.eval_data_paths, + batch_size=args.eval_batch_size, + shuffle=False, + num_epochs=1) + + # Set the eval metrics. + # TODO(brandondutra): make this work with HP tuning. + if util.is_classification_model(args.model_type): + streaming_accuracy = metrics_lib.streaming_accuracy + eval_metrics = { + ('accuracy', 'classes'): streaming_accuracy, + # Export the accuracy as a metric for hyperparameter tuning. 
+ #('training/hptuning/metric', 'classes'): streaming_accuracy + } + else: + eval_metrics = None + + return tf.contrib.learn.Experiment( + estimator=estimator, + train_input_fn=input_reader_for_train, + eval_input_fn=input_reader_for_eval, + train_steps=args.max_steps, + export_strategies=[export_strategy_target, export_strategy_notarget], + min_eval_frequency=args.min_eval_frequency, + ) + + # Return a function to create an Experiment. + return get_experiment + + +def parse_arguments(argv): + """Parse the command line arguments.""" + parser = argparse.ArgumentParser( + description=('Train a regression or classification model. Note that if ' + 'using a DNN model, --layer_size1=NUM, --layer_size2=NUM, ' + 'should be used. ')) + + # I/O file parameters + parser.add_argument('--train_data_paths', type=str, action='append', + required=True) + parser.add_argument('--eval_data_paths', type=str, action='append', + required=True) + parser.add_argument('--output_path', type=str, required=True) + parser.add_argument('--preprocess_output_dir', + type=str, + required=True, + help=('Output folder of preprocessing. Should contain the' + ' files input_features.json, schema.json, and the' + ' optional files numerical_analysis.json and' + ' vocab_str1.csv. Path must be on GCS if running' + ' cloud training.')) + parser.add_argument('--transforms_file', + type=str, + required=True, + help=('File describing the the transforms to apply on ' + 'each column')) + + # HP parameters + parser.add_argument('--learning_rate', type=float, default=0.01, + help='tf.train.AdamOptimizer learning rate') + parser.add_argument('--epsilon', type=float, default=0.0005, + help='tf.train.AdamOptimizer epsilon') + # --layer_size See below + + # Model problems + parser.add_argument('--model_type', + choices=['linear_classification', 'linear_regression', + 'dnn_classification', 'dnn_regression'], + required=True) + parser.add_argument('--top_n', + type=int, + default=1, + help=('For classification problems, the output graph ' + 'will contain the labels and scores for the top ' + 'n classes.')) + # Training input parameters + parser.add_argument('--max_steps', type=int, default=5000, + help='Maximum number of training steps to perform.') + parser.add_argument('--num_epochs', + type=int, + help=('Maximum number of training data epochs on which ' + 'to train. If both --max-steps and --num-epochs ' + 'are specified, the training job will run for ' + '--max-steps or --num-epochs, whichever occurs ' + 'first. If unspecified will run for --max-steps.')) + parser.add_argument('--train_batch_size', type=int, default=1000) + parser.add_argument('--eval_batch_size', type=int, default=1000) + parser.add_argument('--min_eval_frequency', type=int, default=100, + help=('Minimum number of training steps between ' + 'evaluations')) + + # Training output parameters + parser.add_argument('--save_checkpoints_secs', type=int, default=600, + help=('How often the model should be checkpointed/saved ' + 'in seconds')) + + args, remaining_args = parser.parse_known_args(args=argv[1:]) + + # All HP parambeters must be unique, so we need to support an unknown number + # of --layer_size1=10 --layer_size2=10 ... + # Look at remaining_args for layer_size\d+ to get the layer info. 
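# [Illustrative sketch, not part of this patch] For a hypothetical invocation
#   task.py ... --model_type=dnn_regression --layer_size1=10 --layer_size2=5
# parse_known_args() leaves ['--layer_size1=10', '--layer_size2=5'] in
# remaining_args. The code below finds the largest N among layer_sizeN flags,
# requires --layer_size1 .. --layer_sizeN via a second parser, and ends up
# with args.layer_sizes == [10, 5]. The max-N scan in isolation:
import re
remaining_args = ['--layer_size1=10', '--layer_size2=5']
num_layers = max(int(re.search(r'layer_size(\d+)', a).group(1))
                 for a in remaining_args)  # 2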
+ + # Get number of layers + pattern = re.compile('layer_size(\d+)') + num_layers = 0 + for other_arg in remaining_args: + match = re.search(pattern, other_arg) + if match: + num_layers = max(num_layers, int(match.group(1))) + + # Build a new parser so we catch unknown args and missing layer_sizes. + parser = argparse.ArgumentParser() + for i in range(num_layers): + parser.add_argument('--layer_size%s' % str(i+1), type=int, required=True) + + layer_args = vars(parser.parse_args(args=remaining_args)) + layer_sizes = [] + for i in range(num_layers): + key = 'layer_size%s' % str(i+1) + layer_sizes.append(layer_args[key]) + + assert len(layer_sizes) == num_layers + args.layer_sizes = layer_sizes + + return args + + +def main(argv=None): + """Run a Tensorflow model on the Iris dataset.""" + args = parse_arguments(sys.argv if argv is None else argv) + + env = json.loads(os.environ.get('TF_CONFIG', '{}')) + # First find out if there's a task value on the environment variable. + # If there is none or it is empty define a default one. + task_data = env.get('task') or {'type': 'master', 'index': 0} + + trial = task_data.get('trial') + if trial is not None: + output_dir = os.path.join(args.output_path, trial) + else: + output_dir = args.output_path + + learn_runner.run( + experiment_fn=get_experiment_fn(args), + output_dir=output_dir) + + +if __name__ == '__main__': + tf.logging.set_verbosity(tf.logging.INFO) + main() diff --git a/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py new file mode 100755 index 000000000..316d98b9d --- /dev/null +++ b/solutionbox/structured_data/datalab_solutions/structured_data/trainer/util.py @@ -0,0 +1,861 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
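# [Illustrative sketch, not part of this patch] During hyperparameter tuning,
# main() in task.py above reads a 'trial' id out of the TF_CONFIG environment
# variable and gives each trial its own output subfolder. With a made-up
# TF_CONFIG value and a hypothetical --output_path:
import json, os
env = json.loads('{"task": {"type": "master", "index": 0, "trial": "7"}}')
task_data = env.get('task') or {'type': 'master', 'index': 0}
trial = task_data.get('trial')
output_path = 'gs://my-bucket/training'
output_dir = os.path.join(output_path, trial) if trial else output_path
# output_dir == 'gs://my-bucket/training/7'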
+# ============================================================================== + +import json +import multiprocessing +import os +import math +from StringIO import StringIO + +import tensorflow as tf +from tensorflow.python.lib.io import file_io + +from tensorflow.contrib.learn.python.learn.utils import input_fn_utils +from tensorflow.contrib.learn.python.learn import export_strategy +from tensorflow.contrib.learn.python.learn.utils import ( + saved_model_export_utils) + +from tensorflow.python.ops import variables +from tensorflow.contrib.framework.python.ops import variables as contrib_variables +from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib +from tensorflow.python.training import saver +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.framework import ops +from tensorflow.python.client import session as tf_session +from tensorflow.python.saved_model import builder as saved_model_builder +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.util import compat +from tensorflow.python.platform import gfile +from tensorflow.python.saved_model import signature_def_utils + + +SCHEMA_FILE = 'schema.json' +NUMERICAL_ANALYSIS = 'numerical_analysis.json' +CATEGORICAL_ANALYSIS = 'vocab_%s.csv' + + +# Constants for the Prediction Graph fetch tensors. +PG_TARGET = 'target_from_input' + +PG_REGRESSION_PREDICTED_TARGET = 'predicted_target' +PG_CLASSIFICATION_LABEL_TEMPLATE = 'top_%s_label' +PG_CLASSIFICATION_SCORE_TEMPLATE = 'top_%s_score' + +# ============================================================================== +# Exporting the last trained model to a final location +# ============================================================================== + + +def _copy_all(src_files, dest_dir): + # file_io.copy does not copy files into folders directly. + for src_file in src_files: + file_name = os.path.basename(src_file) + new_file_location = os.path.join(dest_dir, file_name) + file_io.copy(src_file, new_file_location, overwrite=True) + + +def _recursive_copy(src_dir, dest_dir): + """Copy the contents of src_dir into the folder dest_dir. + Args: + src_dir: gsc or local path. + dest_dir: gcs or local path. + When called, dest_dir should exist. 
+ """ + file_io.recursive_create_dir(dest_dir) + for file_name in file_io.list_directory(src_dir): + old_path = os.path.join(src_dir, file_name) + new_path = os.path.join(dest_dir, file_name) + + if file_io.is_directory(old_path): + _recursive_copy(old_path, new_path) + else: + file_io.copy(old_path, new_path, overwrite=True) + +def serving_from_csv_input(train_config, args, keep_target): + """Read the input features from a placeholder csv string tensor.""" + examples = tf.placeholder( + dtype=tf.string, + shape=(None,), + name='csv_input_string') + + features = parse_example_tensor(examples=examples, + train_config=train_config, + keep_target=keep_target) + + if keep_target: + target = features.pop(train_config['target_column']) + else: + target = None + features, target = preprocess_input( + features=features, + target=target, + train_config=train_config, + preprocess_output_dir=args.preprocess_output_dir, + model_type=args.model_type) + + return input_fn_utils.InputFnOps(features, + target, + {'csv_line': examples} + ) + + +def make_output_tensors(train_config, args, input_ops, model_fn_ops, keep_target=True): + target_name = train_config['target_column'] + key_name = train_config['key_column'] + + outputs = {} + outputs[key_name] = tf.squeeze(input_ops.features[key_name]) + + if is_classification_model(args.model_type): + + # build maps from ints to the origional categorical strings. + string_value = get_vocabulary(args.preprocess_output_dir, target_name) + table = tf.contrib.lookup.index_to_string_table_from_tensor( + mapping=string_value, + default_value='UNKNOWN') + + # Get the label of the input target. + if keep_target: + input_target_label = table.lookup(input_ops.labels) + outputs[PG_TARGET] = tf.squeeze(input_target_label) + + # TODO(brandondutra): get the score of the target label too. + probabilities = model_fn_ops.predictions['probabilities'] + + # get top k labels and their scores. + (top_k_values, top_k_indices) = tf.nn.top_k(probabilities, k=args.top_n) + top_k_labels = table.lookup(tf.to_int64(top_k_indices)) + + # Write the top_k values using 2*top_k columns. + num_digits = int(math.ceil(math.log(args.top_n, 10))) + if num_digits == 0: + num_digits = 1 + for i in range(0, args.top_n): + # Pad i based on the size of k. So if k = 100, i = 23 -> i = '023'. This + # makes sorting the columns easy. 
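# [Illustrative sketch, not part of this patch] The loop below names the top-k
# outputs with zero-padded ranks so the columns sort lexicographically. For a
# hypothetical args.top_n = 12:
import math
top_n = 12
num_digits = int(math.ceil(math.log(top_n, 10))) or 1  # 2
label_aliases = ['top_%s_label' % str(i + 1).zfill(num_digits)
                 for i in range(top_n)]
# label_aliases[:3] == ['top_01_label', 'top_02_label', 'top_03_label']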
+ padded_i = str(i+1).zfill(num_digits) + + label_alias = PG_CLASSIFICATION_LABEL_TEMPLATE % padded_i + label_tensor_name = (tf.squeeze( + tf.slice(top_k_labels, + [0, i], + [tf.shape(top_k_labels)[0], 1]))) + score_alias = PG_CLASSIFICATION_SCORE_TEMPLATE % padded_i + score_tensor_name = (tf.squeeze( + tf.slice(top_k_values, + [0, i], + [tf.shape(top_k_values)[0], 1]))) + + outputs.update({label_alias: label_tensor_name, + score_alias: score_tensor_name}) + + else: + if keep_target: + outputs[PG_TARGET] = tf.squeeze(input_ops.labels) + + scores = model_fn_ops.predictions['scores'] + outputs[PG_REGRESSION_PREDICTED_TARGET] = tf.squeeze(scores) + + return outputs + + +def make_export_strategy(train_config, args, keep_target, assets_extra=None): + def export_fn(estimator, export_dir_base, checkpoint_path=None, eval_result=None): + with ops.Graph().as_default() as g: + contrib_variables.create_global_step(g) + + input_ops = serving_from_csv_input(train_config, args, keep_target) + model_fn_ops = estimator._call_model_fn(input_ops.features, + None, + model_fn_lib.ModeKeys.INFER) + output_fetch_tensors = make_output_tensors( + train_config=train_config, + args=args, + input_ops=input_ops, + model_fn_ops=model_fn_ops, + keep_target=keep_target) + + signature_def_map = { + 'serving_default': + signature_def_utils.predict_signature_def( + input_ops.default_inputs, + output_fetch_tensors) + } + + if not checkpoint_path: + # Locate the latest checkpoint + checkpoint_path = saver.latest_checkpoint(estimator._model_dir) + if not checkpoint_path: + raise NotFittedError("Couldn't find trained model at %s." + % estimator._model_dir) + + export_dir = saved_model_export_utils.get_timestamped_export_dir( + export_dir_base) + + with tf_session.Session('') as session: + #variables.initialize_local_variables() + variables.local_variables_initializer() + data_flow_ops.tables_initializer() + saver_for_restore = saver.Saver( + variables.global_variables(), + sharded=True) + saver_for_restore.restore(session, checkpoint_path) + + init_op = control_flow_ops.group( + variables.local_variables_initializer(), + data_flow_ops.tables_initializer()) + + # Perform the export + builder = saved_model_builder.SavedModelBuilder(export_dir) + builder.add_meta_graph_and_variables( + session, [tag_constants.SERVING], + signature_def_map=signature_def_map, + assets_collection=ops.get_collection( + ops.GraphKeys.ASSET_FILEPATHS), + legacy_init_op=init_op) + builder.save(False) + + # Add the extra assets + if assets_extra: + assets_extra_path = os.path.join(compat.as_bytes(export_dir), + compat.as_bytes('assets.extra')) + for dest_relative, source in assets_extra.items(): + dest_absolute = os.path.join(compat.as_bytes(assets_extra_path), + compat.as_bytes(dest_relative)) + dest_path = os.path.dirname(dest_absolute) + gfile.MakeDirs(dest_path) + gfile.Copy(source, dest_absolute) + + # only keep the last 3 models + saved_model_export_utils.garbage_collect_exports(export_dir_base, exports_to_keep=3) + + # save the last model to the model folder. 
+ # export_dir_base = A/B/intermediate_models/ + if keep_target: + final_dir = os.path.join(args.output_path, 'evaluation_model') + else: + final_dir = os.path.join(args.output_path, 'model') + if file_io.is_directory(final_dir): + file_io.delete_recursively(final_dir) + file_io.recursive_create_dir(final_dir) + _recursive_copy(export_dir, final_dir) + + + return export_dir + + if keep_target: + intermediate_dir = 'intermediate_evaluation_models' + else: + intermediate_dir = 'intermediate_prediction_models' + + return export_strategy.ExportStrategy(intermediate_dir, export_fn) + + +# ============================================================================== +# Reading the input csv files and parsing its output into tensors. +# ============================================================================== + + +def parse_example_tensor(examples, train_config, keep_target): + """Read the csv files. + + Args: + examples: string tensor + train_config: training config + keep_target: if true, the target column is expected to exist and it is + returned in the features dict. + + Returns: + Dict of feature_name to tensor. Target feature is in the dict. + """ + + csv_header = [] + if keep_target: + csv_header = train_config['csv_header'] + else: + csv_header = [name for name in train_config['csv_header'] + if name != train_config['target_column']] + + # record_defaults are used by tf.decode_csv to insert defaults, and to infer + # the datatype. + record_defaults = [[train_config['csv_defaults'][name]] + for name in csv_header] + tensors = tf.decode_csv(examples, record_defaults, name='csv_to_tensors') + + # I'm not really sure why expand_dims needs to be called. If using regression + # models, it errors without it. + tensors = [tf.expand_dims(x, axis=1) for x in tensors] + + tensor_dict = dict(zip(csv_header, tensors)) + return tensor_dict + + +def read_examples(input_files, batch_size, shuffle, num_epochs=None): + """Creates readers and queues for reading example protos.""" + files = [] + for e in input_files: + for path in e.split(','): + files.extend(file_io.get_matching_files(path)) + thread_count = multiprocessing.cpu_count() + + # The minimum number of instances in a queue from which examples are drawn + # randomly. The larger this number, the more randomness at the expense of + # higher memory requirements. + min_after_dequeue = 1000 + + # When batching data, the queue's capacity will be larger than the batch_size + # by some factor. The recommended formula is (num_threads + a small safety + # margin). For now, we use a single thread for reading, so this can be small. + queue_size_multiplier = thread_count + 3 + + # Convert num_epochs == 0 -> num_epochs is None, if necessary + num_epochs = num_epochs or None + + # Build a queue of the filenames to be read. 
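# [Illustrative sketch, not part of this patch] Queue sizing for the batching
# below, on a hypothetical 8-core machine with batch_size=1000 and shuffle=True:
#   thread_count          = multiprocessing.cpu_count() = 8
#   queue_size_multiplier = thread_count + 3            = 11
#   capacity              = min_after_dequeue + queue_size_multiplier * batch_size
#                         = 1000 + 11 * 1000            = 12000
# i.e. at most 12000 examples are buffered while shuffling.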
+ filename_queue = tf.train.string_input_producer(files, num_epochs, shuffle) + + example_id, encoded_example = tf.TextLineReader().read_up_to( + filename_queue, batch_size) + + if shuffle: + capacity = min_after_dequeue + queue_size_multiplier * batch_size + return tf.train.shuffle_batch( + [example_id, encoded_example], + batch_size, + capacity, + min_after_dequeue, + enqueue_many=True, + num_threads=thread_count) + + else: + capacity = queue_size_multiplier * batch_size + return tf.train.batch( + [example_id, encoded_example], + batch_size, + capacity=capacity, + enqueue_many=True, + num_threads=thread_count) + + +# ============================================================================== +# Building the TF learn estimators +# ============================================================================== + + +def get_estimator(output_dir, train_config, args): + """Returns a tf learn estimator. + + We only support {DNN, Linear}Regressor and {DNN, Linear}Classifier. This is + controlled by the values of model_type in the args. + + Args: + output_dir: Modes are saved into outputdir/train + train_config: our training config + args: command line parameters + + Returns: + TF lean estimator + + Raises: + ValueError: if config is wrong. + """ + + # Check the requested mode fits the preprocessed data. + target_name = train_config['target_column'] + if (is_classification_model(args.model_type) and + target_name not in train_config['categorical_columns']): + raise ValueError('When using a classification model, the target must be a ' + 'categorical variable.') + if (is_regression_model(args.model_type) and + target_name not in train_config['numerical_columns']): + raise ValueError('When using a regression model, the target must be a ' + 'numerical variable.') + + # Check layers used for dnn models. + if is_dnn_model(args.model_type) and not args.layer_sizes: + raise ValueError('--layer_size* must be used with DNN models') + if is_linear_model(args.model_type) and args.layer_sizes: + raise ValueError('--layer_size* cannot be used with linear models') + + # Build tf.learn features + feature_columns = _tflearn_features(train_config, args) + + # Set how often to run checkpointing in terms of time. 
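# [Illustrative sketch, not part of this patch] get_estimator below maps
# --model_type onto a tf.contrib.learn estimator, all sharing the same
# AdamOptimizer(learning_rate, epsilon) and RunConfig:
#   dnn_regression        -> DNNRegressor(hidden_units=args.layer_sizes, ...)
#   linear_regression     -> LinearRegressor(...)
#   dnn_classification    -> DNNClassifier(hidden_units=..., n_classes=..., ...)
#   linear_classification -> LinearClassifier(n_classes=..., ...)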
+ config = tf.contrib.learn.RunConfig( + save_checkpoints_secs=args.save_checkpoints_secs) + + train_dir = os.path.join(output_dir, 'train') + if args.model_type == 'dnn_regression': + estimator = tf.contrib.learn.DNNRegressor( + feature_columns=feature_columns, + hidden_units=args.layer_sizes, + config=config, + model_dir=train_dir, + optimizer=tf.train.AdamOptimizer( + args.learning_rate, epsilon=args.epsilon)) + elif args.model_type == 'linear_regression': + estimator = tf.contrib.learn.LinearRegressor( + feature_columns=feature_columns, + config=config, + model_dir=train_dir, + optimizer=tf.train.AdamOptimizer( + args.learning_rate, epsilon=args.epsilon)) + elif args.model_type == 'dnn_classification': + estimator = tf.contrib.learn.DNNClassifier( + feature_columns=feature_columns, + hidden_units=args.layer_sizes, + n_classes=train_config['vocab_stats'][target_name]['n_classes'], + config=config, + model_dir=train_dir, + optimizer=tf.train.AdamOptimizer( + args.learning_rate, epsilon=args.epsilon)) + elif args.model_type == 'linear_classification': + estimator = tf.contrib.learn.LinearClassifier( + feature_columns=feature_columns, + n_classes=train_config['vocab_stats'][target_name]['n_classes'], + config=config, + model_dir=train_dir, + optimizer=tf.train.AdamOptimizer( + args.learning_rate, epsilon=args.epsilon)) + else: + raise ValueError('bad --model_type value') + + return estimator + + +def preprocess_input(features, target, train_config, preprocess_output_dir, + model_type): + """Perform some transformations after reading in the input tensors. + + Args: + features: dict of feature_name to tensor + target: tensor + train_config: our training config object + preprocess_output_dir: folder should contain the vocab files. + model_type: the tf model type. + + Raises: + ValueError: if wrong transforms are used + + Returns: + New features dict and new target tensor. + """ + + target_name = train_config['target_column'] + key_name = train_config['key_column'] + + # Do the numerical transforms. + # Numerical transforms supported for regression/classification + # 1) num -> do nothing (identity, default) + # 2) num -> scale to -1, 1 (scale) + # 3) num -> scale to -a, a (scale with value parameter) + with tf.name_scope('numerical_feature_preprocess') as scope: + if train_config['numerical_columns']: + numerical_analysis_file = os.path.join(preprocess_output_dir, + NUMERICAL_ANALYSIS) + if not file_io.file_exists(numerical_analysis_file): + raise ValueError('File %s not found in %s' % + (NUMERICAL_ANALYSIS, preprocess_output_dir)) + + numerical_anlysis = json.loads( + file_io.read_file_to_string(numerical_analysis_file)) + + for name in train_config['numerical_columns']: + if name == target_name or name == key_name: + continue + + transform_config = train_config['transforms'].get(name, {}) + transform_name = transform_config.get('transform', None) + if transform_name == 'scale': + value = float(transform_config.get('value', 1.0)) + features[name] = _scale_tensor( + features[name], + range_min=numerical_anlysis[name]['min'], + range_max=numerical_anlysis[name]['max'], + scale_min=-value, + scale_max=value) + elif transform_name == 'identity' or transform_name is None: + pass + else: + raise ValueError(('For numerical variables, only scale ' + 'and identity are supported: ' + 'Error for %s') % name) + + # Do target transform if it exists. 
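# [Illustrative sketch, not part of this patch] The 'scale' transform applied
# above maps a numeric column linearly from its observed [min, max] (taken
# from numerical_analysis.json) onto [-value, value]. For min=0, max=30,
# value=1:
#   scaled(x) = (x - 0) * (1 - (-1)) / (30 - 0) + (-1)
# so scaled(0) = -1.0, scaled(15) = 0.0, scaled(30) = 1.0.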
+  if target is not None:
+    with tf.name_scope('target_feature_preprocess') as scope:
+      if target_name in train_config['categorical_columns']:
+        labels = train_config['vocab_stats'][target_name]['labels']
+        table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
+        target = table.lookup(target)
+        # target = tf.contrib.lookup.string_to_index(target, labels)
+
+  # Do the categorical transforms. Only the vocab mapping is applied here; the
+  # real transforms are done with tf.learn feature columns.
+  with tf.name_scope('categorical_feature_preprocess') as scope:
+    for name in train_config['categorical_columns']:
+      if name == key_name or name == target_name:
+        continue
+      transform_config = train_config['transforms'].get(name, {})
+      transform_name = transform_config.get('transform', None)
+
+      # Supported transforms:
+      # for DNN
+      #   1) string -> hash -> embedding (hash_embedding)
+      #   2) string -> make int -> embedding (embedding)
+      #   3) string -> hash -> one_hot (hash_one_hot)
+      #   4) string -> make int -> one_hot (one_hot, default)
+      # for linear
+      #   1) string -> make int -> sparse_column_with_integerized_feature
+      #      (sparse, default)
+      #   2) string -> sparse_column_with_hash_bucket (hash_sparse)
+      if is_dnn_model(model_type):
+        if (transform_name == 'hash_embedding' or
+            transform_name == 'hash_one_hot'):
+          map_vocab = False
+        elif (transform_name == 'embedding' or
+              transform_name == 'one_hot' or
+              transform_name is None):
+          map_vocab = True
+        else:
+          raise ValueError('For DNN models, only hash_embedding, '
+                           'hash_one_hot, embedding, and one_hot transforms '
+                           'are supported.')
+      elif is_linear_model(model_type):
+        if (transform_name == 'sparse' or
+            transform_name is None):
+          map_vocab = True
+        elif transform_name == 'hash_sparse':
+          map_vocab = False
+        else:
+          raise ValueError('For linear models, only sparse and '
+                           'hash_sparse are supported.')
+      if map_vocab:
+        labels = train_config['vocab_stats'][name]['labels']
+        table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
+        features[name] = table.lookup(features[name])
+
+  return features, target
+
+
+def _scale_tensor(tensor, range_min, range_max, scale_min, scale_max):
+  """Linearly scales a tensor from [range_min, range_max] to [scale_min, scale_max].
+
+  Args:
+    tensor: input tensor. Should be a numerical tensor.
+    range_min: min expected value for this feature/tensor.
+    range_max: max expected value.
+    scale_min: new expected min value.
+    scale_max: new expected max value.
+
+  Returns:
+    The scaled tensor.
+  """
+  if range_min == range_max:
+    return tensor
+
+  float_tensor = tf.to_float(tensor)
+  scaled_tensor = tf.divide(
+      (tf.subtract(float_tensor, range_min)
+       * tf.constant(float(scale_max - scale_min))),
+      tf.constant(float(range_max - range_min)))
+  shifted_tensor = scaled_tensor + tf.constant(float(scale_min))
+
+  return shifted_tensor
+
+
+def _tflearn_features(train_config, args):
+  """Builds the tf.learn feature list.
+
+  All numerical features are given a real_valued_column because all the
+  preprocessing transformations are done in preprocess_input. Categorical
+  features are processed here depending on whether the vocab map (from string
+  to int) was applied in preprocess_input.
+
+  Args:
+    train_config: our train config object.
+    args: command line args.
+
+  Returns:
+    List of TF learn feature columns.
+
+  Raises:
+    ValueError: if the wrong transforms are used for the model type.
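+
+  Example (hypothetical transform entry): a categorical column 'color' with
+    {'transform': 'embedding', 'embedding_dim': 8} in a DNN model becomes
+    embedding_column(sparse_column_with_integerized_feature('color', ...),
+    dimension=8).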
+ """ + feature_columns = [] + target_name = train_config['target_column'] + key_name = train_config['key_column'] + + for name in train_config['numerical_columns']: + if name != target_name and name != key_name: + feature_columns.append(tf.contrib.layers.real_valued_column( + name, + dimension=1)) + + for name in train_config['categorical_columns']: + if name != target_name and name != key_name: + transform_config = train_config['transforms'].get(name, {}) + transform_name = transform_config.get('transform', None) + + if is_dnn_model(args.model_type): + if transform_name == 'hash_embedding': + sparse = tf.contrib.layers.sparse_column_with_hash_bucket( + name, + hash_bucket_size=transform_config['hash_bucket_size']) + learn_feature = tf.contrib.layers.embedding_column( + sparse, + dimension=transform_config['embedding_dim']) + elif transform_name == 'hash_one_hot': + sparse = tf.contrib.layers.sparse_column_with_hash_bucket( + name, + hash_bucket_size=transform_config['hash_bucket_size']) + learn_feature = tf.contrib.layers.embedding_column( + sparse, + dimension=train_config['vocab_stats'][name]['n_classes']) + elif transform_name == 'embedding': + sparse = tf.contrib.layers.sparse_column_with_integerized_feature( + name, + bucket_size=train_config['vocab_stats'][name]['n_classes']) + learn_feature = tf.contrib.layers.embedding_column( + sparse, + dimension=transform_config['embedding_dim']) + elif transform_name == 'one_hot' or transform_name is None: + sparse = tf.contrib.layers.sparse_column_with_integerized_feature( + name, + bucket_size=train_config['vocab_stats'][name]['n_classes']) + learn_feature = tf.contrib.layers.one_hot_column(sparse) + else: + raise ValueError('For DNN modles, only hash_embedding, ' + 'hash_one_hot, embedding, and one_hot transforms ' + 'are supported.') + elif is_linear_model(args.model_type): + if transform_name == 'sparse' or transform_name is None: + learn_feature = tf.contrib.layers.sparse_column_with_integerized_feature( + name, + bucket_size=train_config['vocab_stats'][name]['n_classes']) + elif transform_name == 'hash_sparse': + learn_feature = tf.contrib.layers.sparse_column_with_hash_bucket( + name, + hash_bucket_size=transform_config['hash_bucket_size']) + else: + raise ValueError('For linear models, only sparse and ' + 'hash_sparse are supported.') + + # Save the feature + feature_columns.append(learn_feature) + return feature_columns + + +# ============================================================================== +# Building the TF learn estimators +# ============================================================================== + + +def get_vocabulary(preprocess_output_dir, name): + """Loads the vocabulary file as a list of strings. + + Args: + preprocess_output_dir: Should contain the file CATEGORICAL_ANALYSIS % name. + name: name of the csv column. + + Returns: + List of strings. + + Raises: + ValueError: if file is missing. + """ + vocab_file = os.path.join(preprocess_output_dir, CATEGORICAL_ANALYSIS % name) + if not file_io.file_exists(vocab_file): + raise ValueError('File %s not found in %s' % + (CATEGORICAL_ANALYSIS % name, preprocess_output_dir)) + + labels = file_io.read_file_to_string(vocab_file).split('\n') + label_values = [x for x in labels if x] # remove empty lines + + return label_values + + +def merge_metadata(preprocess_output_dir, transforms_file): + """Merge schema, analysis, and transforms files into one python object. + + Args: + preprocess_output_dir: the output folder of preprocessing. 
+        the schema, and the numerical and categorical analysis files.
+    transforms_file: the training transforms file.
+
+  Returns:
+    A dict in the form
+    {
+      csv_header: [name1, name2, ...],
+      csv_defaults: {name1: value, name2: value},
+      key_column: name,
+      target_column: name,
+      categorical_columns: [],
+      numerical_columns: [],
+      transforms: {name1: {transform: scale, value: 2},
+                   name2: {transform: embedding, dim: 50}, ...
+                  },
+      vocab_stats: {name3: {n_classes: 23, labels: ['1', '2', ..., '23']},
+                    name4: {n_classes: 102, labels: ['red', 'blue', ...]}}
+    }
+
+  Raises:
+    ValueError: if one of the input metadata files is wrong.
+  """
+  numerical_analysis_file = os.path.join(preprocess_output_dir,
+                                         NUMERICAL_ANALYSIS)
+  schema_file = os.path.join(preprocess_output_dir, SCHEMA_FILE)
+
+  numerical_analysis = json.loads(file_io.read_file_to_string(
+      numerical_analysis_file))
+  schema = json.loads(file_io.read_file_to_string(schema_file))
+  transforms = json.loads(file_io.read_file_to_string(transforms_file))
+
+  result_dict = {}
+  result_dict['csv_header'] = [col_schema['name'] for col_schema in schema]
+  result_dict['key_column'] = None
+  result_dict['target_column'] = None
+  result_dict['categorical_columns'] = []
+  result_dict['numerical_columns'] = []
+  result_dict['transforms'] = {}
+  result_dict['csv_defaults'] = {}
+  result_dict['vocab_stats'] = {}
+
+  # Get the key column.
+  for name, trans_config in transforms.iteritems():
+    if trans_config.get('transform', None) == 'key':
+      result_dict['key_column'] = name
+      break
+  if result_dict['key_column'] is None:
+    raise ValueError('Key transform missing from transforms file.')
+
+  # Get the target column.
+  result_dict['target_column'] = schema[0]['name']
+  for name, trans_config in transforms.iteritems():
+    if trans_config.get('transform', None) == 'target':
+      result_dict['target_column'] = name
+      break
+  if result_dict['target_column'] is None:
+    raise ValueError('Target transform missing from transforms file.')
+
+  # Get the numerical/categorical columns.
+  for col_schema in schema:
+    col_name = col_schema['name']
+    col_type = col_schema['type'].lower()
+    if col_name == result_dict['key_column']:
+      continue
+
+    if col_type == 'string':
+      result_dict['categorical_columns'].append(col_name)
+    elif col_type == 'integer' or col_type == 'float':
+      result_dict['numerical_columns'].append(col_name)
+    else:
+      raise ValueError('Unsupported schema type %s' % col_type)
+
+  # Get the transforms.
+  for name, trans_config in transforms.iteritems():
+    if name != result_dict['target_column'] and name != result_dict['key_column']:
+      result_dict['transforms'][name] = trans_config
+
+  # Get the vocab_stats.
+  for name in result_dict['categorical_columns']:
+    if name == result_dict['key_column']:
+      continue
+
+    label_values = get_vocabulary(preprocess_output_dir, name)
+    if name != result_dict['target_column'] and '' not in label_values:
+      label_values.append('')  # append a 'missing' label.
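+    # The empty-string label added above gives missing values their own
+    # integer id, so n_classes below already accounts for it.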
+    n_classes = len(label_values)
+    result_dict['vocab_stats'][name] = {'n_classes': n_classes,
+                                        'labels': label_values}
+
+  # Get the csv_defaults.
+  for col_schema in schema:
+    name = col_schema['name']
+    col_type = col_schema['type'].lower()
+    default = transforms.get(name, {}).get('default', None)
+
+    if name == result_dict['target_column']:
+      if name in result_dict['numerical_columns']:
+        default = float(default or 0.0)
+      else:
+        default = default or ''
+    elif name == result_dict['key_column']:
+      if col_type == 'string':
+        default = str(default or '')
+      elif col_type == 'float':
+        default = float(default or 0.0)
+      else:
+        default = int(default or 0)
+    else:
+      if col_type == 'string':
+        default = str(default or '')
+        if default not in result_dict['vocab_stats'][name]['labels']:
+          raise ValueError('Default %s is not in the vocab for %s' %
+                           (default, name))
+      else:
+        default = float(default or numerical_analysis[name]['mean'])
+
+    result_dict['csv_defaults'][name] = default
+
+  validate_metadata(result_dict)
+  return result_dict
+
+
+def validate_metadata(train_config):
+  """Perform some checks that the training config is correct.
+
+  Args:
+    train_config: train config as produced by merge_metadata().
+
+  Raises:
+    ValueError: if the columns look wrong.
+  """
+
+  # Make sure we have a default for every column.
+  if len(train_config['csv_header']) != len(train_config['csv_defaults']):
+    raise ValueError('Unequal number of columns in input features file and '
+                     'schema file.')
+
+  # Check there are no missing columns. sorted_columns has two copies of the
+  # target column because the target column is also listed in
+  # categorical_columns or numerical_columns.
+  sorted_columns = sorted(train_config['csv_header'] +
+                          [train_config['target_column']])
+
+  sorted_columns2 = sorted(train_config['categorical_columns'] +
+                           train_config['numerical_columns'] +
+                           [train_config['key_column']] +
+                           [train_config['target_column']])
+  if sorted_columns2 != sorted_columns:
+    raise ValueError('Each csv header must be a numerical/categorical type, a '
+                     'key, or a target.')
+
+
+def is_linear_model(model_type):
+  return model_type.startswith('linear_')
+
+
+def is_dnn_model(model_type):
+  return model_type.startswith('dnn_')
+
+
+def is_regression_model(model_type):
+  return model_type.endswith('_regression')
+
+
+def is_classification_model(model_type):
+  return model_type.endswith('_classification')
diff --git a/solutionbox/structured_data/setup.py b/solutionbox/structured_data/setup.py
new file mode 100644
index 000000000..aee640276
--- /dev/null
+++ b/solutionbox/structured_data/setup.py
@@ -0,0 +1,73 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# A copy of this file must be made in datalab_solutions/structured_data/setup.py
+
+import datetime
+import os
+import re
+from setuptools import setup
+
+
+# The version is saved in an __init__ file.
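+# get_version() below scans that __init__ file for a line of the form
+# __version__ = '<version>' and returns the quoted string.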
+def get_version(): + VERSIONFILE = os.path.join('datalab_solutions/structured_data/', + '__init__.py') + if not os.path.isfile(VERSIONFILE): + raise ValueError('setup.py: File not found %s' % VERSIONFILE) + initfile_lines = open(VERSIONFILE, 'rt').readlines() + VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" + for line in initfile_lines: + mo = re.search(VSRE, line, re.M) + if mo: + return mo.group(1) + raise RuntimeError('Unable to find version string in %s.' % (VERSIONFILE,)) + + +setup( + name='structured_data', + version=get_version(), + packages=[ + 'datalab_solutions', + 'datalab_solutions.structured_data', + 'datalab_solutions.structured_data.trainer', + 'datalab_solutions.structured_data.preprocess', + 'datalab_solutions.structured_data.predict', + ], + description='Google Cloud Datalab Structured Data Package', + author='Google', + author_email='google-cloud-datalab-feedback@googlegroups.com', + keywords=[ + ], + license="Apache Software License", + classifiers=[ + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Development Status :: 4 - Beta", + "Environment :: Other Environment", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Software Development :: Libraries :: Python Modules" + ], + long_description=""" + """, + install_requires=[ + 'tensorflow==1.0', + 'protobuf==3.1.0', + 'google-cloud-dataflow==0.5.5' + ], + package_data={ + }, + data_files=[], +)
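+
+# Example usage (hypothetical, not part of the package): build a source
+# distribution of this package with
+#   python setup.py sdist --dist-dir=dist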