Skip to content

Commit

Permalink
[Lib][ML] Replace std::vector usage with Vector (#116)
Browse files Browse the repository at this point in the history
Fix issue #115

1. Change parameters to Vector<T, false>
2. Modify FeatureLabel and unify data and gradient vector type
3. LinearRegression and LogisticRegression objects can now be properly moved / copied.
  • Loading branch information
TatianaJin authored and zzxx-husky committed Nov 28, 2016
1 parent 9afedca commit a1624e9
Show file tree
Hide file tree
Showing 17 changed files with 629 additions and 876 deletions.
77 changes: 49 additions & 28 deletions examples/linear_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,21 @@
// info: (a.k.a. number of epochs)
// How many times the entire training data will be gone through
//
// is_sparse
// type: string
// info: whether the data is dense or sparse
//
// format
// type: string
// info: the data format of input file: libsvm/tsv
//
// Configuration example:
// train:/datasets/regression/MSD/train
// test:/datasets/regression/MSD/test
// n_iter:10
// alpha:0.1
// train=hdfs:///datasets/regression/MSD/train
// test=hdfs:///datasets/regression/MSD/test
// is_sparse=false
// format=tsv
// n_iter=10
// alpha=0.1

#include <string>
#include <vector>
Expand All @@ -48,34 +58,32 @@
#include "lib/ml/scaler.hpp"
#include "lib/ml/sgd.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;

// Log `msg` exactly once across the cluster: only the worker whose global
// thread id is 0 emits the message, so it is not duplicated per worker.
// Takes the string by const reference to avoid copying it on every call.
void report(const std::string& msg) { if (husky::Context::get_global_tid() == 0) husky::base::log_msg(msg); }
void linear_regression() {
auto & train_set = husky::ObjListStore::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListStore::create_objlist<SparseFeatureLabel>("test_set");

template <bool is_sparse>
void linear_regression(double alpha, int num_iter, husky::lib::ml::DataFormat format) {
typedef husky::lib::ml::LabeledPointHObj<double, double, is_sparse> LabeledPointHObj;
auto & train_set = husky::ObjListStore::create_objlist<LabeledPointHObj>("train_set");
auto & test_set = husky::ObjListStore::create_objlist<LabeledPointHObj>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kTSVFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

// scale values to [-1, 1]
// TODO(Tatiana): inconsistent scaling for train and test set may be problematic
husky::lib::ml::LinearScaler<> scaler_train(num_features);
husky::lib::ml::LinearScaler<> scaler_test(num_features);
// TODO(Tatiana): applying the same scaling results in large error?
husky::lib::ml::LinearScaler<double, double, is_sparse> scaler_train(num_features);
husky::lib::ml::LinearScaler<double, double, is_sparse> scaler_test(num_features);
scaler_train.fit_transform(train_set);
scaler_test.fit_transform(test_set);

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize linear regression model
husky::lib::ml::LinearRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LinearRegression<double, double, is_sparse, ParameterBucket<double>> lr(num_features);

lr.report_per_round = true; // report training error per round
lr.train<husky::lib::ml::SGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.template train<husky::lib::ml::SGD>(train_set, num_iter, alpha);

report("The error on training set = " + std::to_string(lr.avg_error(train_set)));
report("The score on training set = " + std::to_string(lr.score(train_set)));
Expand All @@ -84,16 +92,29 @@ void linear_regression() {
report("The score on testing set = " + std::to_string(lr.score(test_set)));
}

// Entry point passed to husky::run_job: parses the run-time parameters
// (learning rate, epoch count, input format, sparsity flag) and dispatches
// to the dense or sparse template instantiation of linear_regression.
void initialize() {
    double alpha = std::stod(husky::Context::get_param("alpha"));   // learning rate
    int num_iter = std::stoi(husky::Context::get_param("n_iter"));  // number of epochs
    auto format = husky::Context::get_param("format");
    // Initialize with a default so data_format is never read uninitialized
    // when an unrecognized format string is supplied (previously UB).
    husky::lib::ml::DataFormat data_format = husky::lib::ml::kTSVFormat;
    if (format == "libsvm") {
        data_format = husky::lib::ml::kLIBSVMFormat;
    } else if (format == "tsv") {
        data_format = husky::lib::ml::kTSVFormat;
    }
    // is_sparse selects the feature-vector representation at compile time.
    if (husky::Context::get_param("is_sparse") == "true") {
        linear_regression<true>(alpha, num_iter, data_format);
    } else {
        linear_regression<false>(alpha, num_iter, data_format);
    }
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args({
"hdfs_namenode", "hdfs_namenode_port", "train", "test", "n_iter", "alpha", "format", "is_sparse"
});
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(linear_regression);
husky::run_job(initialize);
return 0;
}
return 1;
Expand Down
81 changes: 58 additions & 23 deletions examples/logistic_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,26 @@
// type: string
// info: The path of testing data in hadoop
//
// is_sparse
// type: string
// info: whether the data is dense or sparse
//
// format
// type: string
// info: the data format of input file: libsvm/tsv
//
// n_iter
// type: int
// info: (a.k.a. number of epochs)
// How many times the entire training data will be gone through
//
// Configuration example:
// train:/datasets/classification/a9
// test:/datasets/classification/a9t
// n_iter:50
// alpha:0.5
// train=hdfs:///datasets/classification/a9
// test=hdfs:///datasets/classification/a9t
// is_sparse=true
// format=libsvm
// n_iter=50
// alpha=0.5

#include <string>
#include <vector>
Expand All @@ -47,47 +57,72 @@
#include "lib/ml/fgd.hpp"
#include "lib/ml/logistic_regression.hpp"

using husky::lib::ml::SparseFeatureLabel;
using husky::lib::ml::ParameterBucket;

template <bool is_sparse>
void logistic_regression() {
auto & train_set = husky::ObjListStore::create_objlist<SparseFeatureLabel>("train_set");
auto & test_set = husky::ObjListStore::create_objlist<SparseFeatureLabel>("test_set");
using LabeledPointHObj = husky::lib::ml::LabeledPointHObj<double, double, is_sparse>;
auto & train_set = husky::ObjListStore::create_objlist<LabeledPointHObj>("train_set");
auto & test_set = husky::ObjListStore::create_objlist<LabeledPointHObj>("test_set");

// load data
husky::lib::ml::DataLoader<SparseFeatureLabel> data_loader(husky::lib::ml::kLIBSVMFormat);
data_loader.load_info(husky::Context::get_param("train"), train_set);
data_loader.load_info(husky::Context::get_param("test"), test_set);
int num_features = data_loader.get_num_feature();
auto format_str = husky::Context::get_param("format");
husky::lib::ml::DataFormat format;
if (format_str == "libsvm") {
format = husky::lib::ml::kLIBSVMFormat;
} else if (format_str == "tsv") {
format = husky::lib::ml::kTSVFormat;
}

int num_features = husky::lib::ml::load_data(husky::Context::get_param("train"), train_set, format);
husky::lib::ml::load_data(husky::Context::get_param("test"), test_set, format, num_features);

// processing labels
husky::list_execute(train_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});
husky::list_execute(test_set, [](auto& this_obj) {
if (this_obj.y < 0) this_obj.y = 0;
});

double alpha = std::stod(husky::Context::get_param("alpha"));
int num_iter = std::stoi(husky::Context::get_param("n_iter"));

// initialize logistic regression model
husky::lib::ml::LogisticRegression<SparseFeatureLabel, ParameterBucket<double>> lr(num_features);
husky::lib::ml::LogisticRegression<double, double, is_sparse, ParameterBucket<double>> lr(num_features);
lr.report_per_round = true; // report training error per round

// train the model
lr.train<husky::lib::ml::FGD<SparseFeatureLabel, ParameterBucket<double>>>(train_set, num_iter, alpha);
lr.template train<husky::lib::ml::FGD>(train_set, num_iter, alpha);

// estimate generalization error
double test_error = lr.avg_error(test_set);
auto test_error = lr.avg_error(test_set);
if (husky::Context::get_global_tid() == 0) {
// lr.present_param();
// validation
husky::base::log_msg("Error on testing set: " + std::to_string(test_error));
}
}

// Entry point passed to husky::run_job: reads the `is_sparse` parameter and
// dispatches to the matching template instantiation of logistic_regression.
void init() {
    const bool sparse = husky::Context::get_param("is_sparse") == "true";
    if (sparse)
        logistic_regression<true>();
    else
        logistic_regression<false>();
}

int main(int argc, char** argv) {
std::vector<std::string> args;
args.push_back("hdfs_namenode");
args.push_back("hdfs_namenode_port");
args.push_back("train");
args.push_back("test");
args.push_back("n_iter");
args.push_back("alpha");
std::vector<std::string> args(8);
args[0] = "hdfs_namenode";
args[1] = "hdfs_namenode_port";
args[2] = "train";
args[3] = "test";
args[4] = "n_iter";
args[5] = "alpha";
args[6] = "format";
args[7] = "is_sparse";
if (husky::init_with_args(argc, argv, args)) {
husky::run_job(logistic_regression);
husky::run_job(init);
return 0;
}
return 1;
Expand Down
Loading

0 comments on commit a1624e9

Please sign in to comment.