# python/timeseries/activity_word2vec.py

import random
import numpy as np
import numpy.random as rnd
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn import manifold
import collections
import math
import tensorflow as tf
import matplotlib.image as mpimg
from common.utils import *
from common.timeseries_datasets import *
from common.data_plotter import *
from common.nn_utils import *
from timeseries.casas import *
from timeseries.word2vec import *
from timeseries.word2vec_custom import *

from timeseries.simulate_timeseries import *

# Just disables the warning, doesn't enable AVX/FMA
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

"""
Simple word2vec model for activity labels. We will try to find an embedding
of the sensors in an apartment. This embedding will be derived from the
sequence of sensor triggerings as the occupant of the apartment moves around
or conducts daily activities. The goal is to show that the relative positions
of sensors in the embedding space reflect their relative positions in the
real world.

To compare the embedding and the real layout, check the pdf output generated by
this program and compare with ../datasets/CASAS/floor_plans/HH101-sensormap.png

This is a simple application of word2vec on a small dataset and should be good 
for educational purposes.

The activity dataset is obtained from: http://casas.wsu.edu/datasets/hh101.zip
This dataset has sensor readings for a single-occupant home. Multiple sensors are
placed at various locations in an apartment and they turn ON/OFF based on the
occupant's movements (we only use the sensors whose ids start with 'M' or 'MA').

The related floor plan showing the sensor layout can be found at:
    http://ailab.wsu.edu/casas/hh/hh101/profile/page-6.html

More details on the setup can be found in:
    D. Cook, A. Crandall, B. Thomas, and N. Krishnan.
    CASAS: A smart home in a box. IEEE Computer, 46(7):62-69, 2013.
    
    and on the CASAS website: http://ailab.wsu.edu/casas/hh

To execute:
pythonw -m timeseries.activity_word2vec --log_file=temp/timeseries/activity_word2vec.log --n_epochs=1 --debug
"""


if __name__ == "__main__":
    # Script entry point: learn word2vec embeddings for the CASAS HH101
    # sensors from their encoded trigger sequence, optionally project the
    # embeddings to 2-D with t-SNE, and plot the labelled points into a PDF.

    # `logging` is not imported explicitly anywhere in the visible file; import
    # it here so the script does not depend on a wildcard import leaking it.
    import logging

    logger = logging.getLogger(__name__)

    args = get_command_args(debug=False, debug_args=["--n_lags=20",
                                                     "--n_epochs=1",
                                                     "--debug",
                                                     "--plot",
                                                     "--log_file=temp/timeseries/activity_word2vec.log"])

    # Working directory for the log file, the downloaded dataset and the plots.
    # It is created *before* the logger is configured so that a log file
    # located under it can be opened on a first run (previously the directory
    # was only created after configure_logger() had already been called).
    localpath = "./temp/timeseries"
    dir_create(localpath)

    configure_logger(args)

    # Fix both RNGs for reproducible runs.
    random.seed(42)
    rnd.seed(42)

    url = "http://casas.wsu.edu/datasets/"
    casasfile = maybe_download_casas("hh101.zip", url, "hh101/ann.txt", localpath)
    logger.debug("casasfile: %s", casasfile)
    cas = Casas(dataset_path=casasfile)

    logger.debug("cas.sensor2code:\n%s", str(cas.sensor2code))
    logger.debug("cas.code2sensor:\n%s", str(cas.code2sensor))
    logger.debug("cas.sensor_seq:\n%s", str(cas.sensor_seq[0:10]))
    logger.debug("cas.sensor_enc:\n%s", str(cas.sensor_enc[0:10]))

    timer = Timer()

    # Available model variants; mode 0 trains nothing (and hence plots nothing).
    modes = {0: "", 1: "custom", 2: "original"}

    mode = 2

    dims = 15
    window_size = 3
    n_epochs = args.n_epochs
    neg_samples = 3
    normalize_embeddings = False
    # Embeddings with more than two dimensions must be projected before plotting.
    use_tsne = dims > 2

    w2v = None
    embeddings = None

    # Tag used in the output file name to identify the configuration.
    signature = "d%d%s%s%s" % (dims, "_norm" if normalize_embeddings else "",
                               "_" + modes[mode], "_tsne" if use_tsne else "")

    logger.debug("signature: %s", signature)

    # write_sensor_data_as_document(cas)

    if mode == 1:
        w2v = CustomWord2vec(sensors=cas.sensors,
                             code2sensor=cas.code2sensor, sensor2code=cas.sensor2code,
                             dims=dims, window_size=window_size, neg_samples=neg_samples,
                             n_epochs=n_epochs, debug=args.debug)
    elif mode == 2:
        w2v = Word2vec(sensors=cas.sensors,
                       code2sensor=cas.code2sensor, sensor2code=cas.sensor2code,
                       dims=dims, window_size=window_size, n_epochs=n_epochs, num_skips=2,
                       num_sampled=neg_samples, num_steps=100001, debug=args.debug)

    if w2v is not None:
        # Training, timing report and embedding extraction are shared by both
        # model variants (the timing message used to be logged for mode 1 only).
        w2v.fit(cas.sensor_enc)
        logger.debug(timer.message("Completed training in"))
        embeddings = w2v.get_embeddings(normalized=normalize_embeddings)

    if embeddings is not None:
        logger.debug("%s", embeddings.shape)

        if use_tsne:
            logger.debug("computing t-SNE for embedded space...")
            # perplexity=30, as used in original word2vec, does not result
            # in good visualization...
            tsne_embed = manifold.TSNE(perplexity=3,
                                       n_components=2, init='pca',
                                       random_state=0, method='exact', n_iter=5000)
            x_tr = tsne_embed.fit_transform(embeddings)
        else:
            x_tr = embeddings

        # To overlay the floor plan, load it with:
        # img = mpimg.imread("../datasets/CASAS/floor_plans/HH101-sensormap.png")
        # and draw it on a plot with pl.imshow(img).

        pdfpath = "temp/timeseries/activity_sensors_%s.pdf" % signature
        dp = DataPlotter(pdfpath=pdfpath, rows=1, cols=1)

        pl = dp.get_next_plot()
        dp.plot_points(x_tr, pl, marker='+', s=20)
        # Label every point with its sensor id; the sensor code is used as the
        # row index into the (projected) embedding matrix.
        for code, sensor in cas.code2sensor.items():
            pl.text(x_tr[code, 0], x_tr[code, 1], sensor)

        dp.close()