From c4209ca82e4f72411f69f4ff1a4604fa15927266 Mon Sep 17 00:00:00 2001
From: Ian Huston <ihuston@pivotal.io>
Date: Wed, 17 Jun 2015 17:48:25 +0100
Subject: [PATCH] Add final example of a learning API.

---
 04-learning-api/LICENSE.txt              |  27 ++++
 04-learning-api/README.md                |  60 ++++++++
 04-learning-api/environment.yml          |  22 +++
 04-learning-api/main.py                  | 178 +++++++++++++++++++++++
 04-learning-api/manifest.yml             |  12 ++
 04-learning-api/models/ModelFactory.py   |  80 ++++++++++
 04-learning-api/models/StandardModels.py |  31 ++++
 04-learning-api/models/__init__.py       |   0
 04-learning-api/templates/help.html      |  57 ++++++++
 9 files changed, 467 insertions(+)
 create mode 100644 04-learning-api/LICENSE.txt
 create mode 100644 04-learning-api/README.md
 create mode 100644 04-learning-api/environment.yml
 create mode 100644 04-learning-api/main.py
 create mode 100644 04-learning-api/manifest.yml
 create mode 100644 04-learning-api/models/ModelFactory.py
 create mode 100644 04-learning-api/models/StandardModels.py
 create mode 100644 04-learning-api/models/__init__.py
 create mode 100644 04-learning-api/templates/help.html

diff --git a/04-learning-api/LICENSE.txt b/04-learning-api/LICENSE.txt
new file mode 100644
index 0000000..416955d
--- /dev/null
+++ b/04-learning-api/LICENSE.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2015, Alexander Kagoshima, Pivotal Software Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of ds-cfpylearning nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/04-learning-api/README.md b/04-learning-api/README.md
new file mode 100644
index 0000000..46d787e
--- /dev/null
+++ b/04-learning-api/README.md
@@ -0,0 +1,60 @@
+# Simple Cloud Foundry based machine learning API
+
+Modified from code originally written by Alexander Kagoshima.
+See the full version at https://github.com/alexkago/ds-cfpylearning
+
+This app demonstrates a very simple API that can be used to create model instances, feed data to them and let these models retrain periodically. Currently, it uses redis to store model instances, model state and data; for scalability and distributed processing of data this should be replaced by distributed data storage.
+
+For all the examples below replace ```http://<model_domain>``` with your Cloud Foundry app domain.
+
+
+Create a model
+--
+
+```
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "model_type": "LinearRegression", "retrain_counter": 10}' http://<model_domain>/createModel
+```
+
+
+Add in some data
+--
+
+This example shows how to send data into the model created before, so that the linear regression model becomes y = x. Since we set the retrain_counter to 10 previously, the model will retrain after it receives the 10th data instance.
+
+```
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 1, "label": 1}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 2, "label": 2}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 3, "label": 3}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 4, "label": 4}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 5, "label": 5}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 6, "label": 6}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 7, "label": 7}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 8, "label": 8}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 9, "label": 9}' http://<model_domain>/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 10, "label": 10}' http://<model_domain>/ingest
+```
+
+
+Look at all created models
+--
+
+There's a very rudimentary view on the redis set of all models that have been created:
+
+```
+http://<model_domain>/models
+```
+
+
+Look at model details
+--
+
+This lets you check out the status of the previously created model as well as its trained parameters:
+
+```
+http://<model_domain>/models/model1
+```
+
+License
+--
+
+This application is released under the Modified BSD license. Please see the LICENSE.txt file for details.
diff --git a/04-learning-api/environment.yml b/04-learning-api/environment.yml
new file mode 100644
index 0000000..3da87f6
--- /dev/null
+++ b/04-learning-api/environment.yml
@@ -0,0 +1,22 @@
+name: cfpylearning
+dependencies:
+- flask=0.10.1=py27_1
+- itsdangerous=0.24=py27_0
+- jinja2=2.7.3=py27_1
+- markdown=2.6.2=py27_0
+- markupsafe=0.23=py27_0
+- nose=1.3.7=py27_0
+- numpy=1.9.2=py27_0
+- openssl=1.0.1k=1
+- pip=7.0.3=py27_0
+- python=2.7.10=0
+- readline=6.2=2
+- scikit-learn=0.16.1=np19py27_0
+- scipy=0.15.1=np19py27_0
+- setuptools=17.1.1=py27_0
+- sqlite=3.8.4.1=1
+- tk=8.5.18=0
+- werkzeug=0.10.4=py27_0
+- zlib=1.2.8=0
+- pip:
+  - redis==2.10.3
diff --git a/04-learning-api/main.py b/04-learning-api/main.py
new file mode 100644
index 0000000..f63553a
--- /dev/null
+++ b/04-learning-api/main.py
@@ -0,0 +1,178 @@
+import os
+import json
+import redis
+import pickle
+from markdown import markdown
+from flask import Flask, request, jsonify, abort, make_response, Markup, render_template, g
+from models.StandardModels import LinearRegression
+from models import ModelFactory
+
+app = Flask(__name__)
+
+# Host shown in the help page examples; the local dev server listens on port 8080
+cf_app_env = os.getenv('VCAP_APPLICATION')
+if cf_app_env is not None:
+    host = json.loads(cf_app_env)['application_uris'][0]
+else:
+    host = 'localhost:8080'
+
+# initialize redis connection for local and CF deployment
+def connect_db():
+    """Create the redis client and attach it to the Flask app as ``app.r``."""
+    if os.environ.get('VCAP_SERVICES') is None: # running locally
+        DB_HOST = 'localhost'
+        DB_PORT = 6379
+        DB_PW = ''
+        REDIS_DB = 1 if app.config["TESTING"] else 0 # use other db for testing
+
+    else:                                       # running on CF
+        env_vars = os.environ['VCAP_SERVICES']
+        rediscloud_service = json.loads(env_vars)['rediscloud'][0]
+        credentials = rediscloud_service['credentials']
+        DB_HOST = credentials['hostname']
+        DB_PORT = credentials['port']
+        DB_PW = credentials['password']
+        REDIS_DB = 0
+
+    app.r = redis.StrictRedis(host=DB_HOST,
+                              port=DB_PORT,
+                              password=DB_PW,
+                              db=REDIS_DB)
+
+
+# define routes
+@app.route('/')
+def hello():
+    """Render help.html, passing the module-level ``host`` to the template."""
+    return render_template('help.html', host=host)
+
+
+@app.route('/flushDB')
+def flushDB():
+    app.r.flushdb()  # NOTE(review): destructive, yet reachable with a plain GET
+    return 'db flushed', 200
+
+
+@app.route('/createModel', methods=['POST'])
+def createModel():
+    """Create a model from the posted JSON and store it pickled in redis."""
+    json_data = request.get_json(force=True)
+
+    # check if all fields are there
+    if json_data.get('model_name') is None:
+        abort(make_response("model_name field is missing.\n", 422))
+
+    if json_data.get('model_type') is None:
+        abort(make_response("model_type field is missing.\n", 422))
+
+    if json_data.get('retrain_counter') is None:
+        abort(make_response("no retrain information set.\n", 422))
+
+    # build the model first so an unknown type leaves nothing behind in redis
+    mdl = ModelFactory.createModel(json_data.get('model_type'),
+                                   json_data.get('model_name'),
+                                   json_data.get('retrain_counter'))
+
+    if mdl is None:
+        return abort(make_response("No model available of type " +
+                                   json_data.get('model_type') + "\n",
+                     422))
+
+    # save model definition, then add model to list of models
+    app.r.set(json_data.get('model_name') + '_object', pickle.dumps(mdl))
+    app.r.sadd('models', json_data.get('model_name'))
+
+    return "created model: " + str(mdl) + "\n", 201
+
+
+@app.route('/models')
+def modelOverview():
+    return str(app.r.smembers('models')), 200  # str() of the redis 'models' set
+
+
+@app.route('/models/<model_name>')
+def modelInfo(model_name):
+    pickled_mdl = app.r.get(model_name + '_object') or abort(404)  # unknown model
+    return str(pickle.loads(pickled_mdl)), 200
+
+@app.route('/ingest', methods=['POST'])
+def ingest():
+    """Store one posted data point for a model and retrain it when due."""
+    json_data = request.get_json(force=True)
+
+    if json_data.get('model_name') is None:
+        abort(make_response("model_name field is missing.\n", 422))
+
+    # prepare db keys
+    mdl_key = json_data.get('model_name') + '_object'
+    data_key = json_data.get('model_name') + '_data'
+
+    # get the model from the db; a missing key means it was never created
+    pickled_mdl = app.r.get(mdl_key)
+    if pickled_mdl is None:
+        abort(make_response("Model does not exist.\n", 404))
+    mdl = pickle.loads(pickled_mdl)
+
+    # pre-process data
+    del json_data['model_name']
+    col_names = json_data.keys()
+
+    # update the model
+    if mdl.available_data == 0:
+        mdl.set_data_format(col_names)
+    elif mdl.col_names != col_names:
+        return abort(make_response("Data format changed!\n", 422))
+    mdl.avail_data_incr()
+
+    # save data to redis
+    app.r.rpush(data_key, json.dumps(json_data))
+
+    # kick off re-training
+    if (mdl.available_data % mdl.retrain_counter) == 0:
+        data = app.r.lrange(data_key, 0, mdl.available_data)
+        mdl.train(data)
+
+    # save model file
+    app.r.set(mdl_key, pickle.dumps(mdl))
+    return json.dumps(json_data) + " added at " + data_key + "\n", 201
+
+@app.route('/score', methods=['POST'])
+def score():
+    """Predict the label for one posted data point with a trained model."""
+    json_data = request.get_json(force=True)
+
+    if json_data.get('model_name') is None:
+        abort(make_response("model_name field is missing.\n", 422))
+
+    # load the model; a missing key means it was never created
+    mdl_key = json_data.get('model_name') + '_object'
+    pickled_mdl = app.r.get(mdl_key)
+    if pickled_mdl is None:
+        abort(make_response("Model does not exist.\n", 404))
+    mdl = pickle.loads(pickled_mdl)
+
+    if not mdl.trained:
+        return abort(make_response("Model has not been trained yet!\n", 404))
+
+    train_data = dict(json_data)
+    del train_data['model_name']
+    input_keys = [key for key in mdl.col_names if key != 'label']  # copy, not in-place
+
+    if input_keys != train_data.keys():
+        return abort(make_response("Data format for training is different!\n", 422))
+
+    pred_val = mdl.score([train_data[key] for key in input_keys])
+    prediction = {'predicted_label': pred_val[0], 'request': json_data}
+    return json.dumps(prediction), 201
+
+# run app
+if __name__ == "__main__":
+    if os.environ.get('VCAP_SERVICES') is None: # running locally
+        PORT = 8080
+        DEBUG = True
+    else:                                       # running on CF
+        # PORT is the current variable; VCAP_APP_PORT is its deprecated older name
+        PORT = int(os.getenv("PORT") or os.getenv("VCAP_APP_PORT"))
+        DEBUG = False
+    connect_db()
+    app.run(host='0.0.0.0', port=PORT, debug=DEBUG)
diff --git a/04-learning-api/manifest.yml b/04-learning-api/manifest.yml
new file mode 100644
index 0000000..e6c1cc1
--- /dev/null
+++ b/04-learning-api/manifest.yml
@@ -0,0 +1,12 @@
+---
+applications:
+- name: learning-api
+  memory: 512M
+  instances: 1
+  domain: cfapps.io
+  random-route: true
+  path: .
+  buildpack: https://github.com/ihuston/python-conda-buildpack.git
+  command: python main.py
+  services:
+   - myredis
diff --git a/04-learning-api/models/ModelFactory.py b/04-learning-api/models/ModelFactory.py
new file mode 100644
index 0000000..7ddcb75
--- /dev/null
+++ b/04-learning-api/models/ModelFactory.py
@@ -0,0 +1,80 @@
+import json
+import abc
+
+class ModelInterface:
+    """Abstract base holding the bookkeeping state shared by all models."""
+    __metaclass__  = abc.ABCMeta
+    def __init__(self, model_name, retrain_counter, model_type):
+        self.model_name = model_name
+        self.model_type = model_type
+        self.trained = False
+        self.available_data = 0
+        self.used_training_data = 0
+        self.retrain_counter = retrain_counter
+
+    def avail_data_incr(self):
+        self.available_data += 1
+
+    def set_data_format(self, col_names):
+        self.col_names = col_names
+
+    def update_mdl_state(self):
+        self.used_training_data = self.available_data
+        self.trained = True
+
+    @abc.abstractmethod
+    def get_parameters(self):
+        """This method needs to be implemented"""
+
+    @abc.abstractmethod
+    def train(self, train_data):
+        """This method needs to be implemented"""
+
+    @abc.abstractmethod
+    def score(self, score_data):
+        """This method needs to be implemented"""
+
+    def __eq__(self, other):
+        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
+
+    def __str__(self):
+        obj_dict = dict(self.__dict__)  # copy: keep 'parameters' off the instance
+        if self.trained:
+            obj_dict['parameters'] = self.get_parameters()
+        return str(obj_dict)
+
+
+def train_wrapper(func):
+    """Decorate a model's ``train(self, dict_data, col_names)`` method.
+
+    The returned method takes the raw list of JSON strings, decodes each
+    element, calls ``func`` and then calls ``self.update_mdl_state()``.
+    """
+    import functools  # local import: only needed by this decorator
+    @functools.wraps(func)
+    def wrapper(self, data):
+        # pre-process data
+        dict_data = [json.loads(el) for el in data]
+        col_names = dict_data[0].keys()
+
+        # run the actual training function
+        val = func(self, dict_data, col_names)
+
+        # update the model state
+        self.update_mdl_state()
+
+        return val
+
+    return wrapper
+
+
+def createModel(model_type, model_name, retrain_counter):
+    try:  # StandardModels first, CustomModels as fallback, None when neither works
+        import StandardModels
+        return getattr(StandardModels, model_type)(model_name, retrain_counter)
+    except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
+        try:
+            import CustomModels
+            return getattr(CustomModels, model_type)(model_name, retrain_counter)
+        except Exception:
+            return None
diff --git a/04-learning-api/models/StandardModels.py b/04-learning-api/models/StandardModels.py
new file mode 100644
index 0000000..e3facbd
--- /dev/null
+++ b/04-learning-api/models/StandardModels.py
@@ -0,0 +1,31 @@
+from ModelFactory import ModelInterface, train_wrapper
+from sklearn import linear_model
+
+class LinearRegression(ModelInterface):
+    """Linear regression model backed by scikit-learn's ``linear_model``."""
+    def __init__(self, name, rt_counter):
+        ModelInterface.__init__(self, name, rt_counter, 'LinearRegression')
+
+    @train_wrapper
+    def train(self, data, col_names):
+        col_names.remove('label')
+        x = [[el[key] for key in col_names] for el in data]
+        y = [el['label'] for el in data]
+
+        self.mdl = linear_model.LinearRegression()
+        # no third positional argument: that slot is not stable across sklearn releases
+        self.mdl.fit(x, y)
+
+        return self.get_parameters()
+
+    def score(self, data):
+        return self.mdl.predict(data)
+
+    def get_parameters(self):
+        coefficients = self.mdl.coef_.tolist()
+        coefficients.append(self.mdl.intercept_)
+
+        col_names = self.col_names[:]
+        col_names.remove('label')
+        col_names.append('constant')
+        return dict(zip(col_names, coefficients))
diff --git a/04-learning-api/models/__init__.py b/04-learning-api/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/04-learning-api/templates/help.html b/04-learning-api/templates/help.html
new file mode 100644
index 0000000..526da3a
--- /dev/null
+++ b/04-learning-api/templates/help.html
@@ -0,0 +1,57 @@
+<html>
+<head>
+    <title>CF based learning API</title>
+</head>
+<body>
+    <h1>Simple Cloud Foundry based machine learning API</h1>
+<p>
+Modified from code originally written by Alexander Kagoshima.
+See the full version at <a href="https://github.com/alexkago/ds-cfpylearning">https://github.com/alexkago/ds-cfpylearning</a>
+</p>
+<p>
+This app demonstrates a very simple API that can be used to create model instances, feed data to them and let these models retrain periodically. Currently, it uses redis to store model instances, model state and data as well - for scalability and distributed processing of data this should be replaced by distributed data storage.
+
+
+<h2>Create a model</h2>
+
+<pre>
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "model_type": "LinearRegression", "retrain_counter": 10}' http://{{ host }}/createModel
+</pre>
+
+
+<h2>Add in some data</h2>
+
+This example shows how to send data into the model created before, so that the linear regression model becomes y = x. Since we set the retrain_counter to 10 previously, the model will retrain after it receives the 10th data instance.
+
+<pre>
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 1, "label": 1}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 2, "label": 2}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 3, "label": 3}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 4, "label": 4}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 5, "label": 5}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 6, "label": 6}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 7, "label": 7}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 8, "label": 8}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 9, "label": 9}' http://{{ host }}/ingest
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 10, "label": 10}' http://{{ host }}/ingest
+</pre>
+
+<h2>Score a new datapoint</h2>
+Now we can score a new datapoint by using the /score endpoint:
+<pre>
+curl -i -X POST -H "Content-Type: application/json" -d '{"model_name": "model1", "input": 3.5}' http://{{ host }}/score
+</pre>
+
+<h2>Look at all created models</h2>
+
+There's a very rudimentary view on the redis set of all models that have been created:
+<a href="http://{{ host }}/models">http://{{ host }}/models</a>.
+
+<h2>Look at model details</h2>
+
+This lets you check out the status of the previously created model as well as its trained parameters:
+<a href="http://{{ host }}/models/model1">http://{{ host }}/models/model1</a>
+
+</p>
+</body>
+</html>