Skip to content

Commit

Permalink
[Lib][ML] Replace std::vector usage with Vector
Browse files Browse the repository at this point in the history
Fix issue #115

1. Change parameters to Vector<T, false>
2. Modify FeatureLabel and unify the data and gradient vector types
3. LinearRegression and LogisticRegression objects can now be properly moved / copied.
  • Loading branch information
TatianaJin committed Nov 8, 2016
1 parent 313a83d commit 7766d44
Show file tree
Hide file tree
Showing 17 changed files with 476 additions and 818 deletions.
33 changes: 16 additions & 17 deletions examples/linear_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,34 +48,37 @@
#include "lib/ml/scaler.hpp"
#include "lib/ml/sgd.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;
using SparseFeatureLabel = husky::lib::ml::FeatureLabel<double, double, false>;

void report(std::string msg) { if (husky::Context::get_global_tid() == 0) husky::base::log_msg(msg); }
void linear_regression() {
auto & train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kTSVFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
auto& format = husky::lib::ml::kTSVFormat;
// int num_features = std::stoi(husky::Context::get_param("num_features"));
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

// scale values to [-1, 1]
// TODO(Tatiana): inconsistent scaling for train and test set may be problematic
husky::lib::ml::LinearScaler<> scaler_train(num_features);
husky::lib::ml::LinearScaler<> scaler_test(num_features);
// TODO(Tatiana): does applying the same scaling to both sets result in a large error?
// husky::lib::ml::LinearScaler<double, double, false> scaler(num_features);
// scaler.fit_transform(train_set, test_set);
husky::lib::ml::LinearScaler<double, double, false> scaler_train(num_features);
husky::lib::ml::LinearScaler<double, double, false> scaler_test(num_features);
scaler_train.fit_transform(train_set);
scaler_test.fit_transform(test_set);

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize linear regression model
husky::lib::ml::LinearRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LinearRegression<double, double, false, ParameterBucket<double>> lr(num_features);

lr.report_per_round = true; // report training error per round
lr.train<husky::lib::ml::SGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.train<husky::lib::ml::SGD>(train_set, num_iter, alpha);

report("The error on training set = " + std::to_string(lr.avg_error(train_set)));
report("The score on training set = " + std::to_string(lr.score(train_set)));
Expand All @@ -85,13 +88,9 @@ void linear_regression() {
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args({
"hdfs_namenode", "hdfs_namenode_port", "train", "test", "n_iter", "alpha", //"num_features"
});
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(linear_regression);
return 0;
Expand Down
37 changes: 22 additions & 15 deletions examples/logistic_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,37 @@
#include "lib/ml/fgd.hpp"
#include "lib/ml/logistic_regression.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;
using SparseFeatureLabel = husky::lib::ml::FeatureLabel<double, double, true>;

void logistic_regression() {
auto & train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kLIBSVMFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
auto& format = husky::lib::ml::kLIBSVMFormat;
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

husky::list_execute(train_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});
husky::list_execute(test_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize logistic regression model
husky::lib::ml::LogisticRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LogisticRegression<double, double, true, ParameterBucket<double>> lr(num_features);
lr.report_per_round = true; // report training error per round

// train the model
lr.train<husky::lib::ml::FGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.train<husky::lib::ml::FGD>(train_set, num_iter, alpha);

// estimate generalization error
double test_error = lr.avg_error(test_set);
auto test_error = lr.avg_error(test_set);
if (husky::Context::get_global_tid() == 0) {
// lr.present_param();
// validation
Expand All @@ -79,13 +86,13 @@ void logistic_regression() {
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args(6);
args[0] = "hdfs_namenode";
args[1] = "hdfs_namenode_port";
args[2] = "train";
args[3] = "test";
args[4] = "n_iter";
args[5] = "alpha";
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(logistic_regression);
return 0;
Expand Down
82 changes: 31 additions & 51 deletions examples/svm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
//
// train
// type: string
// info: the path of training data in hadoop
// info: the path of training data in hadoop, in LibSVM format
//
// test
// type: string
// info: the path of testing data in hadoop
// info: the path of testing data in hadoop, in LibSVM format
//
// n_iter
// type: int
Expand All @@ -49,41 +49,31 @@
#include "lib/ml/data_loader.hpp"
#include "lib/ml/feature_label.hpp"
#include "lib/ml/parameter.hpp"
#include "lib/ml/vector_linalg.hpp"

using husky::lib::Aggregator;
using husky::lib::AggregatorFactory;

namespace husky {
namespace lib {
namespace ml {

typedef SparseFeatureLabel ObjT;

// how to get label and feature from data object
double get_y_(ObjT& this_obj) { return this_obj.get_label(); }
std::vector<std::pair<int, double>> get_X_(ObjT& this_obj) { return this_obj.get_feature(); }
using ObjT = husky::lib::ml::FeatureLabel<double, double, true>;

void svm() {
auto& train_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("train_set");
auto& test_set = husky::ObjListFactory::create_objlist<SparseFeatureLabel>("test_set");
auto& train_set = husky::ObjListFactory::create_objlist<ObjT>("train_set");
auto& test_set = husky::ObjListFactory::create_objlist<ObjT>("test_set");
auto& format = husky::lib::ml::kLIBSVMFormat;

// load data
DataLoader<SparseFeatureLabel> data_loader(kLIBSVMFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format);

// get model config parameters
double lambda = std::stod(husky::Context::get_param("lambda"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize parameters
ParameterBucket<double> param_list(num_features + 1); // scalar b and vector w
husky::lib::ml::ParameterBucket<double> param_list(num_features + 1); // scalar b and vector w

if (husky::Context::get_global_tid() == 0) {
husky::base::log_msg("num of params: " + std::to_string(param_list.get_num_param()));
}

// get the number of global records
Aggregator<int> num_samples_agg(0, [](int& a, const int& b) { a += b; });
num_samples_agg.update(train_set.get_size());
Expand All @@ -107,13 +97,13 @@ void svm() {
double regulator = 0.0; // prevent overfitting

// calculate w square
for (int idx = 1; idx <= num_features; idx++) {
for (int idx = 0; idx < num_features; idx++) {
double w = param_list.param_at(idx);
sqr_w += w * w;
}

// get local copy of parameters
std::vector<double> bweight = param_list.get_all_param();
auto bweight = param_list.get_all_param();

// calculate regulator
regulator = (sqr_w == 0) ? 1.0 : std::min(1.0, 1.0 / sqrt(sqr_w * lambda));
Expand All @@ -126,7 +116,7 @@ void svm() {

// regularize w in param_list
if (husky::Context::get_global_tid() == 0) {
for (int idx = 1; idx < bweight.size(); idx++) {
for (int idx = 0; idx < num_features; idx++) {
double w = bweight[idx];
param_list.update(idx, (w - w / regulator - eta * w));
}
Expand All @@ -136,21 +126,22 @@ void svm() {
// calculate gradient
list_execute(train_set, {}, {&ac}, [&](ObjT& this_obj) {
double prod = 0; // prod = WX * y
double y = get_y_(this_obj);
std::vector<std::pair<int, double>> X = get_X_(this_obj);
for (auto& x : X)
prod += bweight[x.first] * x.second;
double y = this_obj.y;
auto X = this_obj.x;
for (auto it = X.begin_feaval(); it != X.end_feaval(); ++it)
prod += bweight[(*it).fea] * (*it).val;
// bias
prod += bweight[0];
prod += bweight[num_features];
prod *= y;

if (prod < 1) { // the data point falls within the margin
for (auto& x : X) {
x.second *= y; // calculate the gradient for each parameter
param_list.update(x.first, eta * x.second / num_samples / lambda);
for (auto it = X.begin_feaval(); it != X.end_feaval(); it++) {
auto x = *it;
x.val *= y; // calculate the gradient for each parameter
param_list.update(x.fea, eta * x.val / num_samples / lambda);
}
// update bias
param_list.update(0, eta * y / num_samples);
param_list.update(num_features, eta * y / num_samples);
loss_agg.update(1 - prod);
}
sqr_w_agg.update(sqr_w);
Expand All @@ -172,8 +163,7 @@ void svm() {
// Show result
if (husky::Context::get_global_tid() == 0) {
param_list.present();
husky::base::log_msg(
"Time per iter: " +
husky::base::log_msg("Time per iter: " +
std::to_string(std::chrono::duration_cast<std::chrono::duration<float>>(end - start).count() / num_iter));
}

Expand All @@ -184,12 +174,12 @@ void svm() {
auto bweight = param_list.get_all_param();
list_execute(test_set, {}, {&ac}, [&](ObjT& this_obj) {
double indicator = 0;
double y = get_y_(this_obj);
std::vector<std::pair<int, double>> X = get_X_(this_obj);
for (auto& x : X)
indicator += bweight[x.first] * x.second;
auto y = this_obj.y;
auto X = this_obj.x;
for (auto it = X.begin_feaval(); it != X.end_feaval(); it++)
indicator += bweight[(*it).fea] * (*it).val;
// bias
indicator += bweight[0];
indicator += bweight[num_features];
indicator *= y; // right prediction if positive (Wx+b and y have the same sign)
if (indicator < 0) error_agg.update(1);
num_test_agg.update(1);
Expand All @@ -201,20 +191,10 @@ void svm() {
}
}

} // namespace ml
} // namespace lib
} // namespace husky

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("lambda");
std::vector<std::string> args({"hdfs_namenode", "hdfs_namenode_port", "train", "test", "n_iter", "lambda"});
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(husky::lib::ml::svm);
husky::run_job(svm);
return 0;
}
return 1;
Expand Down
2 changes: 0 additions & 2 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,4 @@ set(lib-objs $<TARGET_OBJECTS:aggregator-objs>)
# Visible to parent directory
set(lib-objs ${lib-objs} PARENT_SCOPE)

add_subdirectory(ml)

add_library(husky-lib STATIC ${lib-objs} ${ml-objs})
25 changes: 0 additions & 25 deletions lib/ml/CMakeLists.txt

This file was deleted.

Loading

0 comments on commit 7766d44

Please sign in to comment.