From a716df1ad999c4fc29d3802463411138b33c95c2 Mon Sep 17 00:00:00 2001
From: yetiancn
Date: Fri, 7 Sep 2018 16:23:56 -0400
Subject: [PATCH] create brain/selectivity; create new test file for augmented_nn.

---
 src/brain/modelgen/augmented_nn.py            | 115 +++++++
 .../augmented_nn.cpp}                         |  12 +-
 .../selectivity/selectivity_defaults.cpp      |  27 ++
 src/brain/workload/workload_defaults.cpp      |  10 -
 .../augmented_nn.h}                           |   4 +-
 .../brain/selectivity/selectivity_defaults.h  |  33 ++
 .../brain/workload/workload_defaults.h        |   9 -
 test/CMakeLists.txt                           |   2 +
 test/brain/augmented_nn_test.cpp              |  71 +++++
 test/brain/model_test.cpp                     |  44 ---
 test/brain/testing_augmented_nn_util.cpp      | 297 ++++++++++++++++++
 test/brain/testing_forecast_util.cpp          | 286 -----------------
 .../include/brain/testing_augmented_nn_util.h |  30 ++
 test/include/brain/testing_forecast_util.h    |  18 --
 14 files changed, 583 insertions(+), 375 deletions(-)
 create mode 100644 src/brain/modelgen/augmented_nn.py
 rename src/brain/{workload/augmentedNN.cpp => selectivity/augmented_nn.cpp} (92%)
 create mode 100644 src/brain/selectivity/selectivity_defaults.cpp
 rename src/include/brain/{workload/augmentedNN.h => selectivity/augmented_nn.h} (94%)
 create mode 100644 src/include/brain/selectivity/selectivity_defaults.h
 create mode 100644 test/brain/augmented_nn_test.cpp
 create mode 100644 test/brain/testing_augmented_nn_util.cpp
 create mode 100644 test/include/brain/testing_augmented_nn_util.h

diff --git a/src/brain/modelgen/augmented_nn.py b/src/brain/modelgen/augmented_nn.py
new file mode 100644
index 00000000000..a05055a9e14
--- /dev/null
+++ b/src/brain/modelgen/augmented_nn.py
@@ -0,0 +1,115 @@
+#===----------------------------------------------------------------------===#
+#
+# Peloton
+#
+# augmented_nn.py
+#
+# Identification: src/brain/modelgen/augmented_nn.py
+#
+# Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+#
+#===----------------------------------------------------------------------===#
+
+import tensorflow as tf
+import functools
+import os
+import argparse
+
+def lazy_property(function):
+    attribute = '_cache_' + function.__name__
+
+    @property
+    @functools.wraps(function)
+    def decorator(self):
+        if not hasattr(self, attribute):
+            setattr(self, attribute, function(self))
+        return getattr(self, attribute)
+
+    return decorator
+
+class AugmentedNN:
+
+    def __init__(self, column_num, order=1, neuron_num=16, lr=0.1, **kwargs):
+        tf.reset_default_graph()
+        self.data = tf.placeholder(tf.float32, [None, column_num*2], name="data_")
+        self.target = tf.placeholder(tf.float32, [None, 1], name="target_")
+        self._column_num = column_num
+        self._order = order
+        self._neuron_num = neuron_num
+        self._lr = tf.placeholder_with_default(lr, shape=None,
+                                               name="learn_rate_")
+        self.tf_init = tf.global_variables_initializer
+        self.prediction
+        self.loss
+        self.optimize
+
+    @staticmethod
+    def jump_activation(k):
+        """
+        This is an activation function used to learn discontinuous functions.
+        Reference: https://dl.acm.org/citation.cfm?id=2326898
+        """
+        def jump_activation_k(x):
+            return tf.pow(tf.maximum(0.0, 1-tf.exp(-x)), k)
+        return jump_activation_k
+
+    @lazy_property
+    def prediction(self):
+        net = self.data
+        kernel_init = tf.random_normal_initializer(mean=0.0001, stddev=0.0001)
+        with tf.name_scope("hidden_layer"):
+            net_shape = tf.shape(net)
+            bsz = net_shape[0]
+
+            h1_layers = []
+            for i in range(1, self._order+1):
+                h1 = tf.layers.dense(net, self._neuron_num,
+                                     activation=self.jump_activation(i),
+                                     kernel_initializer=kernel_init)
+                h1_layers.append(h1)
+            h1_layers = tf.concat(h1_layers, 1)
+        with tf.name_scope("output_layer"):
+            net = tf.layers.dense(h1_layers, 1,
+                                  activation=self.jump_activation(1),
+                                  kernel_initializer=kernel_init)
+        net = tf.reshape(net, [bsz, -1], name="pred_")
+        return net
+
+    @lazy_property
+    def loss(self):
+        loss = tf.reduce_mean(tf.squared_difference(self.target, self.prediction), name='lossOp_')
+        return loss
+
+    @lazy_property
+    def optimize(self):
+        params = tf.trainable_variables()
+        gradients = tf.gradients(self.loss, params)
+        optimizer = tf.train.AdagradOptimizer(learning_rate=self._lr)
+        return optimizer.apply_gradients(zip(gradients,
+                                             params), name="optimizeOp_")
+
+    def write_graph(self, dir):
+        fname = "{}.pb".format(self.__repr__())
+        abs_path = os.path.join(dir, fname)
+        if not os.path.exists(abs_path):
+            tf.train.write_graph(tf.get_default_graph(),
+                                 dir, fname, False)
+
+    def __repr__(self):
+        return "augmented_nn"
+
+def main():
+    parser = argparse.ArgumentParser(description='AugmentedNN Model Generator')
+
+    parser.add_argument('--column_num', type=int, default=1, help='Number of input columns')
+    parser.add_argument('--order', type=int, default=3, help='Max order of activation function')
+    parser.add_argument('--neuron_num', type=int, default=20, help='Number of neurons in hidden layer')
+    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
+    parser.add_argument('graph_out_path', type=str, help='Path to write graph output', nargs='+')
+    args = parser.parse_args()
+    model = AugmentedNN(args.column_num, args.order, args.neuron_num, args.lr)
+    model.tf_init()
+    model.write_graph(' '.join(args.graph_out_path))
+
+if __name__ == '__main__':
+    main()
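The jump activation above is g_k(x) = max(0, 1 - e^{-x})^k: it is 0 for x <= 0 and saturates to 1 as x grows, so a layer of such units can stack smooth "steps" to fit the discontinuous selectivity curves the cited reference targets. A standalone numpy sketch of the function (illustrative only, not part of the patch):

import numpy as np

def jump_activation(k):
    # same form as jump_activation() in augmented_nn.py above:
    # g_k(x) = max(0, 1 - e^{-x})^k
    return lambda x: np.maximum(0.0, 1.0 - np.exp(-x)) ** k

x = np.array([-1.0, 0.0, 0.5, 2.0, 6.0])
print(jump_activation(1)(x))  # [0.    0.    0.393 0.865 0.998]
print(jump_activation(3)(x))  # higher order k rises later and more sharply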
diff --git a/src/brain/workload/augmentedNN.cpp b/src/brain/selectivity/augmented_nn.cpp
similarity index 92%
rename from src/brain/workload/augmentedNN.cpp
rename to src/brain/selectivity/augmented_nn.cpp
index f57544edc9b..a6878e3865a 100644
--- a/src/brain/workload/augmentedNN.cpp
+++ b/src/brain/selectivity/augmented_nn.cpp
@@ -2,15 +2,15 @@
 //
 // Peloton
 //
-// augmentedNN.cpp
+// augmented_nn.cpp
 //
-// Identification: src/brain/workload/augmentedNN.cpp
+// Identification: src/brain/selectivity/augmented_nn.cpp
 //
 // Copyright (c) 2015-2018, Carnegie Mellon University Database Group
 //
 //===----------------------------------------------------------------------===//
 
-#include "brain/workload/augmentedNN.h"
+#include "brain/selectivity/augmented_nn.h"
 #include "brain/util/model_util.h"
 #include "brain/util/tf_session_entity/tf_session_entity.h"
 #include "brain/util/tf_session_entity/tf_session_entity_input.h"
@@ -22,8 +22,8 @@ namespace brain {
 
 AugmentedNN::AugmentedNN(int column_num, int order, int neuron_num,
                          float learn_rate, int batch_size, int epochs)
-    : BaseTFModel("src/brain/modelgen", "src/brain/modelgen/AugmentedNN.py",
-                  "src/brain/modelgen/AugmentedNN.pb"),
+    : BaseTFModel("src/brain/modelgen", "src/brain/modelgen/augmented_nn.py",
+                  "src/brain/modelgen/augmented_nn.pb"),
       column_num_(column_num),
       order_(order),
       neuron_num_(neuron_num),
@@ -49,7 +49,7 @@ std::string AugmentedNN::ConstructModelArgsString() const {
 
 std::string AugmentedNN::ToString() const {
   std::stringstream model_str_builder;
-  model_str_builder << "AugmentedNN(";
+  model_str_builder << "augmented_nn(";
   model_str_builder << "column_num = " << column_num_;
   model_str_builder << ", order = " << order_;
   model_str_builder << ", neuron_num = " << neuron_num_;
diff --git a/src/brain/selectivity/selectivity_defaults.cpp b/src/brain/selectivity/selectivity_defaults.cpp
new file mode 100644
index 00000000000..c1b8254edc1
--- /dev/null
+++ b/src/brain/selectivity/selectivity_defaults.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// selectivity_defaults.cpp
+//
+// Identification: src/brain/selectivity/selectivity_defaults.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include "brain/selectivity/selectivity_defaults.h"
+
+namespace peloton {
+namespace brain {
+
+const int AugmentedNNDefaults::COLUMN_NUM = 1;
+const int AugmentedNNDefaults::ORDER = 1;
+const int AugmentedNNDefaults::NEURON_NUM = 16;
+const float AugmentedNNDefaults::LR = 0.1f;
+const int AugmentedNNDefaults::BATCH_SIZE = 256;
+const int AugmentedNNDefaults::EPOCHS = 600;
+
+
+} // namespace brain
+} // namespace peloton
diff --git a/src/brain/workload/workload_defaults.cpp b/src/brain/workload/workload_defaults.cpp
index f7bb548a4c4..529299db9a6 100644
--- a/src/brain/workload/workload_defaults.cpp
+++ b/src/brain/workload/workload_defaults.cpp
@@ -36,15 +36,5 @@ const int LinearRegWorkloadDefaults::BPTT = 90;
 
 const int KernelRegWorkloadDefaults::BPTT = 90;
 
-
-const int AugmentedNNWorkloadDefaults::COLUMN_NUM = 1;
-const int AugmentedNNWorkloadDefaults::ORDER = 1;
-const int AugmentedNNWorkloadDefaults::NEURON_NUM = 16;
-const float AugmentedNNWorkloadDefaults::LR = 0.1f;
-const int AugmentedNNWorkloadDefaults::BATCH_SIZE = 256;
-const int AugmentedNNWorkloadDefaults::EPOCHS = 600;
-
-
-
 } // namespace brain
 } // namespace peloton
diff --git a/src/include/brain/workload/augmentedNN.h b/src/include/brain/selectivity/augmented_nn.h
similarity index 94%
rename from src/include/brain/workload/augmentedNN.h
rename to src/include/brain/selectivity/augmented_nn.h
index 71caca8e55f..f39715b665f 100644
--- a/src/include/brain/workload/augmentedNN.h
+++ b/src/include/brain/selectivity/augmented_nn.h
@@ -2,9 +2,9 @@
 //
 // Peloton
 //
-// augmentedNN.h
+// augmented_nn.h
 //
-// Identification: src/include/brain/workload/augmentedNN.h
+// Identification: src/include/brain/selectivity/augmented_nn.h
 //
 // Copyright (c) 2015-2018, Carnegie Mellon University Database Group
 //
diff --git a/src/include/brain/selectivity/selectivity_defaults.h b/src/include/brain/selectivity/selectivity_defaults.h
new file mode 100644
index 00000000000..c7aad20e3a4
--- /dev/null
+++ b/src/include/brain/selectivity/selectivity_defaults.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// selectivity_defaults.h
+//
+// Identification: src/include/brain/selectivity/selectivity_defaults.h
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+/**
+ * This header file contains default attributes
+ * associated with the selectivity prediction task
+ **/
+
+namespace peloton {
+namespace brain {
+
+struct AugmentedNNDefaults {
+  static const int COLUMN_NUM;
+  static const int ORDER;
+  static const int NEURON_NUM;
+  static const float LR;
+  static const int BATCH_SIZE;
+  static const int EPOCHS;
+};
+
+} // namespace brain
+} // namespace peloton
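These defaults mirror the arguments of the Python generator above, so the .pb graph that the C++ side loads can be reproduced by hand. A minimal sketch (assuming a TF 1.x environment with augmented_nn.py importable; the output directory is the one hard-coded into BaseTFModel):

from augmented_nn import AugmentedNN

# AugmentedNNDefaults from selectivity_defaults.cpp:
# COLUMN_NUM=1, ORDER=1, NEURON_NUM=16, LR=0.1
model = AugmentedNN(column_num=1, order=1, neuron_num=16, lr=0.1)
model.tf_init()                           # builds the variable-initializer op
model.write_graph("src/brain/modelgen")   # emits augmented_nn.pb (skipped if present)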
diff --git a/src/include/brain/workload/workload_defaults.h b/src/include/brain/workload/workload_defaults.h
index 3c91affd7bd..82ca9604d4b 100644
--- a/src/include/brain/workload/workload_defaults.h
+++ b/src/include/brain/workload/workload_defaults.h
@@ -65,14 +65,5 @@ struct KernelRegWorkloadDefaults {
   static const int BPTT;
 };
 
-struct AugmentedNNWorkloadDefaults {
-  static const int COLUMN_NUM;
-  static const int ORDER;
-  static const int NEURON_NUM;
-  static const float LR;
-  static const int BATCH_SIZE;
-  static const int EPOCHS;
-};
-
 } // namespace brain
 } // namespace peloton
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index edff00eecba..9276087430f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -49,6 +49,7 @@ set(TESTING_UTIL_SQL ${PROJECT_SOURCE_DIR}/test/sql/testing_sql_util.cpp)
 set(TESTING_UTIL_INDEX ${PROJECT_SOURCE_DIR}/test/index/testing_index_util.cpp)
 set(TESTING_UTIL_CODEGEN ${PROJECT_SOURCE_DIR}/test/codegen/testing_codegen_util.cpp)
 set(TESTING_UTIL_FORECAST ${PROJECT_SOURCE_DIR}/test/brain/testing_forecast_util.cpp)
+set(TESTING_UTIL_AUGMENTEDNN ${PROJECT_SOURCE_DIR}/test/brain/testing_augmented_nn_util.cpp)
 
 add_library(peloton-test-common EXCLUDE_FROM_ALL ${gmock_srcs} ${HARNESS}
   ${TESTING_UTIL_EXECUTOR}
@@ -60,6 +61,7 @@
   ${TESTING_UTIL_SQL}
   ${TESTING_UTIL_CODEGEN}
   ${TESTING_UTIL_FORECAST}
+  ${TESTING_UTIL_AUGMENTEDNN}
 )
 
 # --[ Add "make check" target
diff --git a/test/brain/augmented_nn_test.cpp b/test/brain/augmented_nn_test.cpp
new file mode 100644
index 00000000000..c370771f552
--- /dev/null
+++ b/test/brain/augmented_nn_test.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// augmented_nn_test.cpp
+//
+// Identification: test/brain/augmented_nn_test.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include <memory>
+#include "brain/selectivity/augmented_nn.h"
+#include "brain/selectivity/selectivity_defaults.h"
+#include "brain/testing_augmented_nn_util.h"
+#include "brain/util/model_util.h"
+#include "brain/workload/workload_defaults.h"
+#include "common/harness.h"
+#include "util/file_util.h"
+
+namespace peloton {
+namespace test {
+class AugmentedNNTests : public PelotonTest {};
+
+TEST_F(AugmentedNNTests, DISABLED_AugmentedNNUniformTest) {
+  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
+      brain::AugmentedNNDefaults::COLUMN_NUM,
+      brain::AugmentedNNDefaults::ORDER,
+      brain::AugmentedNNDefaults::NEURON_NUM,
+      brain::AugmentedNNDefaults::LR,
+      brain::AugmentedNNDefaults::BATCH_SIZE,
+      brain::AugmentedNNDefaults::EPOCHS));
+  EXPECT_TRUE(model->IsTFModel());
+  size_t LOG_INTERVAL = 20;
+  size_t NUM_SAMPLES = 10000;
+  float VAL_SPLIT = 0.5;
+  bool NORMALIZE = false;
+  float VAL_THRESH = 0.05;
+
+  TestingAugmentedNNUtil::Test(*model, DistributionType::UniformDistribution,
+                               LOG_INTERVAL, NUM_SAMPLES,
+                               VAL_SPLIT, NORMALIZE, VAL_THRESH,
+                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
+                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
+}
+
+TEST_F(AugmentedNNTests, DISABLED_AugmentedNNSkewedTest) {
+  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
+      brain::AugmentedNNDefaults::COLUMN_NUM,
+      brain::AugmentedNNDefaults::ORDER,
+      brain::AugmentedNNDefaults::NEURON_NUM,
+      brain::AugmentedNNDefaults::LR,
+      brain::AugmentedNNDefaults::BATCH_SIZE,
+      brain::AugmentedNNDefaults::EPOCHS));
+  EXPECT_TRUE(model->IsTFModel());
+  size_t LOG_INTERVAL = 20;
+  size_t NUM_SAMPLES = 10000;
+  float VAL_SPLIT = 0.5;
+  bool NORMALIZE = false;
+  float VAL_THRESH = 0.05;
+
+  TestingAugmentedNNUtil::Test(*model, DistributionType::SkewedDistribution,
+                               LOG_INTERVAL, NUM_SAMPLES,
+                               VAL_SPLIT, NORMALIZE, VAL_THRESH,
+                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
+                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
+}
+
+} // namespace test
+} // namespace peloton
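Both tests reuse CommonWorkloadDefaults::ESTOP_PATIENCE and ESTOP_DELTA for early stopping. ModelUtil::EarlyStop's body is not part of this patch, so the following is a hedged sketch of the usual patience/min-delta rule it presumably implements, not a transcription:

def early_stop(val_losses, patience, delta):
    # Assumed semantics: stop once the validation loss from `patience`
    # evaluations ago has not improved by at least `delta` since.
    if len(val_losses) <= patience:
        return False
    return val_losses[-patience - 1] - min(val_losses[-patience:]) < delta

print(early_stop([0.9, 0.5, 0.499, 0.498, 0.497], patience=3, delta=0.01))  # True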
diff --git a/test/brain/model_test.cpp b/test/brain/model_test.cpp
index 6ea96d3d018..1a26f3ed603 100644
--- a/test/brain/model_test.cpp
+++ b/test/brain/model_test.cpp
@@ -16,7 +16,6 @@
 #include "brain/workload/kernel_model.h"
 #include "brain/workload/linear_model.h"
 #include "brain/workload/lstm.h"
-#include "brain/workload/augmentedNN.h"
 #include "brain/workload/workload_defaults.h"
 #include "common/harness.h"
 #include "util/file_util.h"
@@ -136,49 +135,6 @@ TEST_F(ModelTests, DISABLED_TimeSeriesEnsembleTest) {
                                  brain::CommonWorkloadDefaults::ESTOP_DELTA);
 }
 
-TEST_F(ModelTests, DISABLED_AugmentedNNUniformTest) {
-  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
-      brain::AugmentedNNWorkloadDefaults::COLUMN_NUM,
-      brain::AugmentedNNWorkloadDefaults::ORDER,
-      brain::AugmentedNNWorkloadDefaults::NEURON_NUM,
-      brain::AugmentedNNWorkloadDefaults::LR,
-      brain::AugmentedNNWorkloadDefaults::BATCH_SIZE,
-      brain::AugmentedNNWorkloadDefaults::EPOCHS));
-  EXPECT_TRUE(model->IsTFModel());
-  size_t LOG_INTERVAL = 20;
-  size_t NUM_SAMPLES = 10000;
-  float VAL_SPLIT = 0.5;
-  bool NORMALIZE = false;
-  float VAL_THESH = 0.05;
-
-  TestingAugmentedNNUtil::Test(*model, DistributionType::UniformDistribution,
-                               LOG_INTERVAL, NUM_SAMPLES,
-                               VAL_SPLIT, NORMALIZE, VAL_THESH,
-                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
-                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
-}
-
-TEST_F(ModelTests, DISABLED_AugmentedNNSkewedTest) {
-  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
-      brain::AugmentedNNWorkloadDefaults::COLUMN_NUM,
-      brain::AugmentedNNWorkloadDefaults::ORDER,
-      brain::AugmentedNNWorkloadDefaults::NEURON_NUM,
-      brain::AugmentedNNWorkloadDefaults::LR,
-      brain::AugmentedNNWorkloadDefaults::BATCH_SIZE,
-      brain::AugmentedNNWorkloadDefaults::EPOCHS));
-  EXPECT_TRUE(model->IsTFModel());
-  size_t LOG_INTERVAL = 20;
-  size_t NUM_SAMPLES = 10000;
-  float VAL_SPLIT = 0.5;
-  bool NORMALIZE = false;
-  float VAL_THESH = 0.05;
-
-  TestingAugmentedNNUtil::Test(*model, DistributionType::SkewedDistribution,
-                               LOG_INTERVAL, NUM_SAMPLES,
-                               VAL_SPLIT, NORMALIZE, VAL_THESH,
-                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
-                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
-}
-
 } // namespace test
 } // namespace peloton
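The utility below frames selectivity estimation as supervised regression: each sample is [lower bound, upper bound, true selectivity], with the bounds rescaled from bucket indices in [1, NUM_X] to [-1, 1] and the label computed from a histogram prefix sum. A numpy sketch of one row, matching the uniform case in GetData (illustrative, not part of the patch):

import numpy as np

NUM_X = 1000
hist = np.full(NUM_X + 1, 100.0)
hist[0] = 0.0                        # bucket 0 unused; GetData is 1-based
csum = np.cumsum(hist)               # csum[i] = sum of buckets 1..i
l, u = 250, 750                      # an example range predicate
sel = (csum[u] - csum[l - 1]) / csum[NUM_X]
row = [(2 / NUM_X) * l - 1, (2 / NUM_X) * u - 1, sel]
print(row)                           # [-0.5, 0.5, 0.501]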
diff --git a/test/brain/testing_augmented_nn_util.cpp b/test/brain/testing_augmented_nn_util.cpp
new file mode 100644
index 00000000000..648ec7dca3c
--- /dev/null
+++ b/test/brain/testing_augmented_nn_util.cpp
@@ -0,0 +1,297 @@
+#include "brain/testing_augmented_nn_util.h"
+#include <algorithm>
+#include "brain/util/model_util.h"
+#include "brain/util/eigen_util.h"
+#include "common/harness.h"
+#include <random>
+
+namespace peloton {
+namespace test {
+
+void TestingAugmentedNNUtil::Test(
+    brain::AugmentedNN &model, DistributionType d,
+    size_t val_interval, size_t num_samples,
+    float val_split, bool normalize, float val_loss_thresh,
+    size_t early_stop_patience, float early_stop_delta) {
+  LOG_INFO("Using Model: %s", model.ToString().c_str());
+  size_t num_tests = model.GetBatchsize();
+  matrix_eig all_data = GetData(d, num_samples, num_tests);
+
+  matrix_eig test_data = all_data.bottomRows(num_tests*3);
+  matrix_eig data = all_data.topRows(all_data.rows() - num_tests*3);
+
+  brain::Normalizer n(normalize);
+  val_interval = std::min<size_t>(val_interval, model.GetEpochs());
+
+  // Determine the split point
+  size_t split_point =
+      data.rows() - static_cast<size_t>(data.rows() * val_split);
+
+  // Split into train/validate data
+  matrix_eig train_data = data.topRows(split_point);
+  n.Fit(train_data);
+  train_data = n.Transform(train_data);
+  matrix_eig validate_data =
+      n.Transform(data.bottomRows(
+          static_cast<size_t>(data.rows() - split_point)));
+
+  vector_eig train_loss_avg = vector_eig::Zero(val_interval);
+  float prev_train_loss = std::numeric_limits<float>::max();
+  float val_loss = val_loss_thresh * 2;
+  std::vector<float> val_losses;
+  for (int epoch = 1; epoch <= model.GetEpochs() &&
+                      !brain::ModelUtil::EarlyStop(
+                          val_losses, early_stop_patience, early_stop_delta);
+       epoch++) {
+    auto train_loss = model.TrainEpoch(train_data);
+    size_t idx = (epoch - 1) % val_interval;
+    train_loss_avg(idx) = train_loss;
+    if (epoch % val_interval == 0) {
+      val_loss = model.ValidateEpoch(validate_data);
+      train_loss = train_loss_avg.mean();
+      EXPECT_LE(train_loss, prev_train_loss);
+      LOG_DEBUG("Train Loss: %.10f, Valid Loss: %.10f", train_loss, val_loss);
+      prev_train_loss = train_loss;
+    }
+  }
+  EXPECT_LE(val_loss, val_loss_thresh);
+
+  matrix_eig check_data =
+      test_data.block(0, 0, test_data.rows(), test_data.cols() - 1);
+  matrix_eig check_target_data =
+      test_data.block(0, test_data.cols() - 1, test_data.rows(), 1);
+
+  matrix_eig test_res = model.Predict(check_data, num_tests*3);
+
+  LOG_INFO("Test on high end: ");
+  for (size_t i = 0; i < 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  float test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.topRows(num_tests),
+      test_res.topRows(num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+
+  LOG_INFO("Test on low end: ");
+  for (size_t i = num_tests; i < num_tests + 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.middleRows(num_tests, num_tests),
+      test_res.middleRows(num_tests, num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+
+  LOG_INFO("Test randomly: ");
+  for (size_t i = 2 * num_tests; i < 2 * num_tests + 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.bottomRows(num_tests),
+      test_res.bottomRows(num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+
+}
+
+
+matrix_eig TestingAugmentedNNUtil::GetData(DistributionType d,
+                                           size_t num_samples,
+                                           size_t num_tests) {
+  matrix_eig data;
+  switch (d) {
+    case DistributionType::UniformDistribution: {
+      int NUM_X = 1000;
+      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
+      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
+      float sum_hist = 0;
+      for (int i = 1; i <= NUM_X; i++) {
+        hist(i, 0) = 100;
+      }
+
+      for (int i = 1; i <= NUM_X; i++) {
+        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
+      }
+      sum_hist = sum(NUM_X, 0);
+
+      // generate training and validating data randomly
+      data = matrix_eig::Zero(num_samples, 3);  // 3: lowerbound, upperbound, sel
+      std::mt19937 rng;
+      rng.seed(std::random_device()());
+      std::uniform_int_distribution<int> dist(1, NUM_X);
+
+      // data: [lowerbound, upperbound, truth selectivity]
+      for (size_t i = 0; i < num_samples; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        // assume the max&min values of the col are known
+        // so here preprocessing([min,max]->[-1,1]) can be done
+        data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        data(i, 2) = sel;
+      }
+
+      float HIGH_SEL = 0.8;
+      float LOW_SEL = 0.2;
+
+      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
+
+      // generate test data with high selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l, u;
+        float sel;
+        do {
+          l = dist(rng);
+          u = dist(rng);
+          if (l > u) {
+            std::swap(l, u);
+          }
+          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        } while (sel <= HIGH_SEL);
+        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_highsel_data(i, 2) = sel;
+      }
+
+      // generate test data with low selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l, u;
+        float sel;
+        do {
+          l = dist(rng);
+          u = dist(rng);
+          if (l > u) {
+            std::swap(l, u);
+          }
+          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        } while (sel >= LOW_SEL);
+        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_lowsel_data(i, 2) = sel;
+      }
+
+      // generate test data with random selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_random_data(i, 2) = sel;
+      }
+
+      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
+                                          test_lowsel_data, test_random_data};
+      data = peloton::brain::EigenUtil::VStack(data_vec);
+
+      break;
+    }
+    case DistributionType::SkewedDistribution: {
+      // generate skewed dataset
+      int NUM_X = 1000;
+      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
+      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
+      float sum_hist = 0;
+
+      // skewed
+      for (int i = 1; i < 100; i++) {
+        hist(i, 0) = 2 + std::round(100 *
+                         std::exp(-0.001 * std::pow(i - 100.0, 2)));
+      }
+      for (int i = 100; i <= NUM_X; i++) {
+        hist(i, 0) = 2 + std::round(100 *
+                         std::exp(-0.00008 * std::pow(i - 100.0, 2)));
+      }
+
+      for (int i = 1; i <= NUM_X; i++) {
+        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
+      }
+      sum_hist = sum(NUM_X, 0);
+
+      // generate training and testing data randomly
+      data = matrix_eig::Zero(num_samples, 3);
+      std::mt19937 rng;
+      rng.seed(std::random_device()());
+      std::uniform_int_distribution<int> dist(1, NUM_X);
+
+      // data: [lowerbound, upperbound, truth selectivity]
+      for (size_t i = 0; i < num_samples; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        // assume the max&min values of the col are known
+        // so here preprocessing([min,max]->[-1,1]) can be done
+        data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        data(i, 2) = sel;
+      }
+      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
+      std::uniform_int_distribution<int> dist_low(300, 999);
+      std::uniform_int_distribution<int> dist_high(50, 150);
+
+      // generate test data on the low end
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist_low(rng);
+        int u = dist_low(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_lowsel_data(i, 2) = sel;
+      }
+
+      // generate test data on the high end
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist_high(rng);
+        int u = dist_high(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_highsel_data(i, 2) = sel;
+      }
+
+      // generate test data randomly
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_random_data(i, 2) = sel;
+      }
+
+      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
+                                          test_lowsel_data, test_random_data};
+      data = peloton::brain::EigenUtil::VStack(data_vec);
+      break;
+    }
+  }
+  return data;
+}
+
+
+} // namespace test
+} // namespace peloton
diff --git a/test/brain/testing_forecast_util.cpp b/test/brain/testing_forecast_util.cpp
index a27e513bb14..d5a37b60f74 100644
--- a/test/brain/testing_forecast_util.cpp
+++ b/test/brain/testing_forecast_util.cpp
@@ -115,291 +115,5 @@ matrix_eig TestingForecastUtil::GetWorkload(WorkloadType w, size_t num_samples,
 }
 
-
-void TestingAugmentedNNUtil::Test(
-    brain::AugmentedNN &model, DistributionType d,
-    size_t val_interval, size_t num_samples,
-    float val_split, bool normalize, float val_loss_thresh,
-    size_t early_stop_patience, float early_stop_delta) {
-  LOG_INFO("Using Model: %s", model.ToString().c_str());
-  size_t num_tests = model.GetBatchsize();
-  matrix_eig all_data = GetData(d, num_samples, num_tests);
-
-  matrix_eig test_data = all_data.bottomRows(num_tests*3);
-  matrix_eig data = all_data.topRows(all_data.rows() - num_tests*3);
-
-  brain::Normalizer n(normalize);
-  val_interval = std::min<size_t>(val_interval, model.GetEpochs());
-
-  // Determine the split point
-  size_t split_point =
-      data.rows() - static_cast<size_t>(data.rows() * val_split);
-
-  // Split into train/test data
-  matrix_eig train_data = data.topRows(split_point);
-  n.Fit(train_data);
-  train_data = n.Transform(train_data);
-  matrix_eig validate_data =
-      n.Transform(data.bottomRows(
-          static_cast<size_t>(data.rows() - split_point)));
-
-  vector_eig train_loss_avg = vector_eig::Zero(val_interval);
-  float prev_train_loss = std::numeric_limits<float>::max();
-  float val_loss = val_loss_thresh * 2;
-  std::vector<float> val_losses;
-  for (int epoch = 1; epoch <= model.GetEpochs() &&
-                      !brain::ModelUtil::EarlyStop(
-                          val_losses, early_stop_patience, early_stop_delta);
-       epoch++) {
-    auto train_loss = model.TrainEpoch(train_data);
-    size_t idx = (epoch - 1) % val_interval;
-    train_loss_avg(idx) = train_loss;
-    if (epoch % val_interval == 0) {
-      val_loss = model.ValidateEpoch(validate_data);
-      train_loss = train_loss_avg.mean();
-      EXPECT_LE(train_loss, prev_train_loss);
-      LOG_DEBUG("Train Loss: %.10f, Valid Loss: %.10f", train_loss, val_loss);
-      prev_train_loss = train_loss;
-    }
-  }
-  EXPECT_LE(val_loss, val_loss_thresh);
-
-  matrix_eig check_data =
-      test_data.block(0, 0, test_data.rows(), test_data.cols() - 1);
-  matrix_eig check_target_data =
-      test_data.block(0, test_data.cols() - 1, test_data.rows(), 1);
-
-  matrix_eig test_res = model.Predict(check_data, num_tests*3);
-
-  LOG_INFO("Test with on high end: ");
-  for (size_t i = 0; i < 10; i++) {
-    LOG_INFO("Truth: %.8f, Pred: %.8f",
-             check_target_data(i,0), test_res(i,0));
-  }
-  float test_loss = peloton::brain::ModelUtil::MeanSqError(
-      check_target_data.topRows(num_tests),
-      test_res.topRows(num_tests));
-  LOG_INFO("AMSE: %.8f", test_loss);
-
-  LOG_INFO("Test with on low end: ");
-  for (size_t i = num_tests; i < num_tests + 10; i++) {
-    LOG_INFO("Truth: %.8f, Pred: %.8f",
-             check_target_data(i,0), test_res(i,0));
-  }
-  test_loss = peloton::brain::ModelUtil::MeanSqError(
-      check_target_data.middleRows(num_tests, num_tests),
-      test_res.middleRows(num_tests, num_tests));
-  LOG_INFO("AMSE: %.8f", test_loss);
-
-  LOG_INFO("Test randomly: ");
-  for (size_t i = 2 * num_tests; i < 2 * num_tests + 10; i++) {
-    LOG_INFO("Truth: %.8f, Pred: %.8f",
-             check_target_data(i,0), test_res(i,0));
-  }
-  test_loss = peloton::brain::ModelUtil::MeanSqError(
-      check_target_data.bottomRows(num_tests),
-      test_res.bottomRows(num_tests));
-  LOG_INFO("AMSE: %.8f", test_loss);
-
-}
-
-
-matrix_eig TestingAugmentedNNUtil::GetData(DistributionType d,
-                                           size_t num_samples,
-                                           size_t num_tests) {
-  matrix_eig data;
-  switch (d) {
-    case DistributionType::UniformDistribution: {
-      int NUM_X = 1000;
-      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
-      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
-      float sum_hist = 0;
-      for (int i = 1; i <= NUM_X; i++) {
-        hist(i, 0) = 100;
-      }
-
-      for (int i = 1; i <= NUM_X; i++) {
-        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
-      }
-      sum_hist = sum(NUM_X, 0);
-
-      // generate training and validating data randomly
-      data = matrix_eig::Zero(num_samples, 3); //3:lowerbound, upperbound, sel
-      std::mt19937 rng;
-      rng.seed(std::random_device()());
-      std::uniform_int_distribution<int> dist(1, NUM_X);
-
-      // data: [lowerbound, upperbound, truth selectivity]
-      for (size_t i = 0; i < num_samples; i++) {
-        int l = dist(rng);
-        int u = dist(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        // assume the max&min values of the col are known
-        // so here preprocessing([min,max]->[-1,1]) can be done
-        data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        data(i, 2) = sel;
-      }
-
-      float HIGH_SEL = 0.8;
-      float LOW_SEL = 0.2;
-
-      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
-      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
-      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
-
-      // generate test data with high selectivity
-      for (size_t i = 0; i < num_tests; i++) {
-        int l, u;
-        float sel;
-        do {
-          l = dist(rng);
-          u = dist(rng);
-          if (l > u) {
-            std::swap(l, u);
-          }
-          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        } while(sel <= HIGH_SEL);
-        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_highsel_data(i, 2) = sel;
-      }
-
-      // generate test data with low selectivity
-      for (size_t i = 0; i < num_tests; i++) {
-        int l, u;
-        float sel;
-        do {
-          l = dist(rng);
-          u = dist(rng);
-          if (l > u) {
-            std::swap(l, u);
-          }
-          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        } while(sel >= LOW_SEL);
-        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_lowsel_data(i, 2) = sel;
-      }
-
-      // generate test data with random selectivity
-      for (size_t i = 0; i < num_tests; i++) {
-        int l = dist(rng);
-        int u = dist(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1,0)) / sum_hist;
-        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_random_data(i, 2) = sel;
-      }
-
-      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
-                                          test_lowsel_data, test_random_data};
-      data = peloton::brain::EigenUtil::VStack(data_vec);
-
-      break;
-    }
-    case DistributionType::SkewedDistribution: {
-      // generate skewed dataset
-      int NUM_X = 1000;
-      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
-      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
-      float sum_hist = 0;
-
-      // skewed
-      for (int i = 1; i < 100; i++) {
-        hist(i, 0) = 2 + std::round(100 *
-                         std::exp(-0.001 * std::pow(i - 100.0, 2)));
-      }
-      for (int i = 100; i <= NUM_X; i++) {
-        hist(i, 0) = 2 + std::round(100 *
-                         std::exp(-0.00008 * std::pow(i - 100.0, 2)));
-      }
-
-      for (int i = 1; i <= NUM_X; i++) {
-        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
-      }
-      sum_hist = sum(NUM_X, 0);
-
-      // generate training and testing data randomly
-      data = matrix_eig::Zero(num_samples, 3);
-      std::mt19937 rng;
-      rng.seed(std::random_device()());
-      std::uniform_int_distribution<int> dist(1, NUM_X);
-
-      // data: [lowerbound, upperbound, truth selectivity]
-      for (size_t i = 0; i < num_samples; i++) {
-        int l = dist(rng);
-        int u = dist(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        // assume the max&min values of the col are known
-        // so here preprocessing([min,max]->[-1,1]) can be done
-        data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        data(i, 2) = sel;
-      }
-      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
-      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
-      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
-      std::uniform_int_distribution<int> dist_low(300, 999);
-      std::uniform_int_distribution<int> dist_high(50, 150);
-
-      // generate test data on the low end
-      for (size_t i = 0; i < num_tests; i++) {
-        int l = dist_low(rng);
-        int u = dist_low(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_lowsel_data(i, 2) = sel;
-      }
-
-      // generate test data on the high end
-      for (size_t i = 0; i < num_tests; i++) {
-        int l = dist_high(rng);
-        int u = dist_high(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_highsel_data(i, 2) = sel;
-      }
-
-      // generate test data randomly
-      for (size_t i = 0; i < num_tests; i++) {
-        int l = dist(rng);
-        int u = dist(rng);
-        if (l > u) {
-          std::swap(l, u);
-        }
-        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
-        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
-        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
-        test_random_data(i, 2) = sel;
-      }
-
-      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
-                                          test_lowsel_data, test_random_data};
-      data = peloton::brain::EigenUtil::VStack(data_vec);
-      break;
-    }
-  }
-  return data;
-}
-
-
-
 } // namespace test
 } // namespace peloton
diff --git a/test/include/brain/testing_augmented_nn_util.h b/test/include/brain/testing_augmented_nn_util.h
new file mode 100644
index 00000000000..0c45d67fc7a
--- /dev/null
+++ b/test/include/brain/testing_augmented_nn_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "common/internal_types.h"
+#include "brain/util/eigen_util.h"
+#include "brain/workload/base_tf.h"
+#include "brain/selectivity/augmented_nn.h"
+
+namespace peloton {
+namespace test {
+
+enum class DistributionType { UniformDistribution, SkewedDistribution };
+
+class TestingAugmentedNNUtil {
+ public:
+  static void Test(brain::AugmentedNN& model,
+                   DistributionType d, size_t val_interval,
+                   size_t num_samples = 1000,
+                   float val_split = 0.5,
+                   bool normalize = false,
+                   float val_loss_thresh = 0.06,
+                   size_t early_stop_patience = 10,
+                   float early_stop_delta = 0.01);
+// private:
+  static matrix_eig GetData(DistributionType d,
+                            size_t num_samples, size_t num_tests);
+};
+
+
+} // namespace test
+} // namespace peloton
diff --git a/test/include/brain/testing_forecast_util.h b/test/include/brain/testing_forecast_util.h
index 0b37f3740ac..01750612a9c 100644
--- a/test/include/brain/testing_forecast_util.h
+++ b/test/include/brain/testing_forecast_util.h
@@ -4,7 +4,6 @@
 #include "brain/util/eigen_util.h"
 #include "brain/workload/base_tf.h"
 #include "brain/workload/ensemble_model.h"
-#include "brain/workload/augmentedNN.h"
 
 namespace peloton{
 namespace test{
@@ -35,23 +34,6 @@ class TestingForecastUtil{
                        size_t num_feats);
 };
 
-enum class DistributionType{ UniformDistribution, SkewedDistribution };
-
-class TestingAugmentedNNUtil{
- public:
-  static void Test(brain::AugmentedNN& model,
-                   DistributionType d, size_t val_interval,
-                   size_t num_samples = 1000,
-                   float val_split = 0.5,
-                   bool normalize = false,
-                   float val_loss_thresh = 0.06,
-                   size_t early_stop_patience = 10,
-                   float early_stop_delta = 0.01);
-// private:
-  static matrix_eig GetData(DistributionType d,
-                            size_t num_samples, size_t num_tests);
-};
-
 }
 }
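For reference, the SkewedDistribution histogram built in GetData is a bump centered at bucket 100 with a slow right-tail decay (coefficient 0.001 on the left versus 0.00008 on the right), which is why the tests probe the dense "high end" with ranges drawn from [50, 150] and the sparse "low end" with ranges from [300, 999]. A standalone numpy sketch of its shape (illustrative only, not part of the patch):

import numpy as np

NUM_X = 1000
i = np.arange(1, NUM_X + 1, dtype=float)
hist = np.where(i < 100,
                2 + np.round(100 * np.exp(-0.001 * (i - 100.0) ** 2)),
                2 + np.round(100 * np.exp(-0.00008 * (i - 100.0) ** 2)))
print(hist[99], hist[199], hist[899])  # 102.0 47.0 2.0 -- peak, mid tail, far tail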