Skip to content

Commit

Permalink
[Lib][ML] Replace std::vector usage with Vector
Browse files Browse the repository at this point in the history
Fix issue #115

1. Change parameters to Vector<T, false>
2. Modify FeatureLabel and unify the data and gradient vector types
  • Loading branch information
TatianaJin committed Nov 7, 2016
1 parent dc1474b commit 6b6c7dd
Show file tree
Hide file tree
Showing 17 changed files with 432 additions and 754 deletions.
32 changes: 15 additions & 17 deletions examples/linear_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,34 +48,36 @@
#include "lib/ml/scaler.hpp"
#include "lib/ml/sgd.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;
using SparseFeatureLabel = husky::lib::ml::FeatureLabel<double, double, false>;

// Print a progress/result message exactly once across the cluster:
// only the worker whose global thread id is 0 logs it.
void report(std::string msg) {
    if (husky::Context::get_global_tid() != 0)
        return;
    husky::base::log_msg(msg);
}
void linear_regression() {
auto & train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kTSVFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
auto& format = husky::lib::ml::kTSVFormat;
// int num_features = std::stoi(husky::Context::get_param("num_features"));
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

// scale values to [-1, 1]
// TODO(Tatiana): inconsistent scaling for train and test set may be problematic
husky::lib::ml::LinearScaler<> scaler_train(num_features);
husky::lib::ml::LinearScaler<> scaler_test(num_features);
// TODO(Tatiana): does applying the same scaling to both sets result in a large error?
// husky::lib::ml::LinearScaler<double, double, false> scaler(num_features);
// scaler.fit_transform(train_set, test_set);
husky::lib::ml::LinearScaler<double, double, false> scaler_train(num_features);
husky::lib::ml::LinearScaler<double, double, false> scaler_test(num_features);
scaler_train.fit_transform(train_set);
scaler_test.fit_transform(test_set);

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize linear regression model
husky::lib::ml::LinearRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LinearRegression<double, double, false, ParameterBucket<double>> lr(num_features);
lr.report_per_round = true; // report training error per round
lr.train<husky::lib::ml::SGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.train<husky::lib::ml::SGD>(train_set, num_iter, alpha);

report("The error on training set = " + std::to_string(lr.avg_error(train_set)));
report("The score on training set = " + std::to_string(lr.score(train_set)));
Expand All @@ -85,13 +87,9 @@ void linear_regression() {
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args({
"hdfs_namenode", "hdfs_namenode_port", "train", "test", "n_iter", "alpha", //"num_features"
});
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(linear_regression);
return 0;
Expand Down
37 changes: 22 additions & 15 deletions examples/logistic_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,37 @@
#include "lib/ml/fgd.hpp"
#include "lib/ml/logistic_regression.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;
using SparseFeatureLabel = husky::lib::ml::FeatureLabel<double, double, true>;

void logistic_regression() {
auto & train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kLIBSVMFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
auto& format = husky::lib::ml::kLIBSVMFormat;
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

husky::list_execute(train_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});
husky::list_execute(test_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize logistic regression model
husky::lib::ml::LogisticRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LogisticRegression<double, double, true, ParameterBucket<double>> lr(num_features);
lr.report_per_round = true; // report training error per round

// train the model
lr.train<husky::lib::ml::FGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.train<husky::lib::ml::FGD>(train_set, num_iter, alpha);

// estimate generalization error
double test_error = lr.avg_error(test_set);
auto test_error = lr.avg_error(test_set);
if (husky::Context::get_global_tid() == 0) {
// lr.present_param();
// validation
Expand All @@ -79,13 +86,13 @@ void logistic_regression() {
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args(6);
args[0] = "hdfs_namenode";
args[1] = "hdfs_namenode_port";
args[2] = "train";
args[3] = "test";
args[4] = "n_iter";
args[5] = "alpha";
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(logistic_regression);
return 0;
Expand Down
82 changes: 31 additions & 51 deletions examples/svm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
//
// train
// type: string
// info: the path of training data in hadoop
// info: the path of training data in Hadoop, in LibSVM format
//
// test
// type: string
// info: the path of testing data in hadoop
// info: the path of testing data in Hadoop, in LibSVM format
//
// n_iter
// type: int
Expand All @@ -49,41 +49,31 @@
#include "lib/ml/data_loader.hpp"
#include "lib/ml/feature_label.hpp"
#include "lib/ml/parameter.hpp"
#include "lib/ml/vector_linalg.hpp"

using husky::lib::Aggregator;
using husky::lib::AggregatorFactory;

namespace husky {
namespace lib {
namespace ml {

typedef SparseFeatureLabel ObjT;

// how to get label and feature from data object
double get_y_(ObjT& this_obj) { return this_obj.get_label(); }
std::vector<std::pair<int, double>> get_X_(ObjT& this_obj) { return this_obj.get_feature(); }
using ObjT = husky::lib::ml::FeatureLabel<double, double, true>;

void svm() {
auto& train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto& test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");
auto& train_set = husky::ObjListFactory::create_objlist<ObjT>("train_set");
auto& test_set = husky::ObjListFactory::create_objlist<ObjT>("test_set");
auto& format = husky::lib::ml::kLIBSVMFormat;

// load data
DataLoader<SparseFeatureLabel> data_loader(kLIBSVMFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format);

// get model config parameters
double lambda = std::stod(husky::Context::get_param("lambda"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize parameters
ParameterBucket<double> param_list(num_features + 1); // scalar b and vector w
husky::lib::ml::ParameterBucket<double> param_list(num_features + 1); // scalar b and vector w

if (husky::Context::get_global_tid() == 0) {
husky::base::log_msg("num of params: " + std::to_string(param_list.get_num_param()));
}

// get the number of global records
Aggregator<int> num_samples_agg(0, [](int& a, const int& b) { a += b; });
num_samples_agg.update(train_set.get_size());
Expand All @@ -107,13 +97,13 @@ void svm() {
double regulator = 0.0; // prevent overfitting

// calculate w square
for (int idx = 1; idx <= num_features; idx++) {
for (int idx = 0; idx < num_features; idx++) {
double w = param_list.param_at(idx);
sqr_w += w * w;
}

// get local copy of parameters
std::vector<double> bweight = param_list.get_all_param();
auto bweight = param_list.get_all_param();

// calculate regulator
regulator = (sqr_w == 0) ? 1.0 : std::min(1.0, 1.0 / sqrt(sqr_w * lambda));
Expand All @@ -126,7 +116,7 @@ void svm() {

// regularize w in param_list
if (husky::Context::get_global_tid() == 0) {
for (int idx = 1; idx < bweight.size(); idx++) {
for (int idx = 0; idx < num_features; idx++) {
double w = bweight[idx];
param_list.update(idx, (w - w / regulator - eta * w));
}
Expand All @@ -136,21 +126,22 @@ void svm() {
// calculate gradient
list_execute(train_set, {}, {&ac}, [&](ObjT& this_obj) {
double prod = 0; // prod = WX * y
double y = get_y_(this_obj);
std::vector<std::pair<int, double>> X = get_X_(this_obj);
for (auto& x : X)
prod += bweight[x.first] * x.second;
double y = this_obj.y;
auto X = this_obj.x;
for (auto it = X.begin_feaval(); it != X.end_feaval(); ++it)
prod += bweight[(*it).fea] * (*it).val;
// bias
prod += bweight[0];
prod += bweight[num_features];
prod *= y;

if (prod < 1) { // the data point falls within the margin
for (auto& x : X) {
x.second *= y; // calculate the gradient for each parameter
param_list.update(x.first, eta * x.second / num_samples / lambda);
for (auto it = X.begin_feaval(); it != X.end_feaval(); it++) {
auto x = *it;
x.val *= y; // calculate the gradient for each parameter
param_list.update(x.fea, eta * x.val / num_samples / lambda);
}
// update bias
param_list.update(0, eta * y / num_samples);
param_list.update(num_features, eta * y / num_samples);
loss_agg.update(1 - prod);
}
sqr_w_agg.update(sqr_w);
Expand All @@ -172,8 +163,7 @@ void svm() {
// Show result
if (husky::Context::get_global_tid() == 0) {
param_list.present();
husky::base::log_msg(
"Time per iter: " +
husky::base::log_msg("Time per iter: " +
std::to_string(std::chrono::duration_cast<std::chrono::duration<float>>(end - start).count() / num_iter));
}

Expand All @@ -184,12 +174,12 @@ void svm() {
auto bweight = param_list.get_all_param();
list_execute(test_set, {}, {&ac}, [&](ObjT& this_obj) {
double indicator = 0;
double y = get_y_(this_obj);
std::vector<std::pair<int, double>> X = get_X_(this_obj);
for (auto& x : X)
indicator += bweight[x.first] * x.second;
auto y = this_obj.y;
auto X = this_obj.x;
for (auto it = X.begin_feaval(); it != X.end_feaval(); it++)
indicator += bweight[(*it).fea] * (*it).val;
// bias
indicator += bweight[0];
indicator += bweight[num_features];
indicator *= y; // right prediction if positive (Wx+b and y have the same sign)
if (indicator < 0) error_agg.update(1);
num_test_agg.update(1);
Expand All @@ -201,20 +191,10 @@ void svm() {
}
}

} // namespace ml
} // namespace lib
} // namespace husky

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("lambda");
std::vector<std::string> args({"hdfs_namenode", "hdfs_namenode_port", "train", "test", "n_iter", "lambda"});
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(husky::lib::ml::svm);
husky::run_job(svm);
return 0;
}
return 1;
Expand Down
2 changes: 0 additions & 2 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,4 @@ set(lib-objs $<TARGET_OBJECTS:aggregator-objs>)
# Visible to parent directory
set(lib-objs ${lib-objs} PARENT_SCOPE)

add_subdirectory(ml)

add_library(husky-lib STATIC ${lib-objs} ${ml-objs})
25 changes: 0 additions & 25 deletions lib/ml/CMakeLists.txt

This file was deleted.

Loading

0 comments on commit 6b6c7dd

Please sign in to comment.