diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3990bba..0d60885 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,7 +179,7 @@ endif()
 
 if(TEST)
     add_subdirectory(test)
-    add_subdirectory(testpy)
+#    add_subdirectory(testpy)
 else()
     message("Skipping tests")
 endif()
diff --git a/abcranger.cpp b/abcranger.cpp
index 09e8312..e755e6c 100644
--- a/abcranger.cpp
+++ b/abcranger.cpp
@@ -64,13 +64,27 @@ int main(int argc, char* argv[]) {
         chosenscen = static_cast(opts["chosenscen"].as());
         auto myread = readreftable_scen(headerfile, reftablefile, chosenscen, nref);
-        const auto statobs = readStatObs(statobsfile);
+        auto origobs = readStatObs(statobsfile);
+        size_t nstat = myread.stats_names.size();
+        size_t num_samples = origobs.size() / nstat;
+        if (((origobs.size() % nstat) != 0) || (num_samples < 1)) {
+            std::cout << "wrong number of summary statistics in statobs file." << std::endl;
+            exit(1);
+        }
+        MatrixXd statobs = Map(origobs.data(),nstat,num_samples).transpose();
 
         auto res = EstimParam_fun(myread, statobs, opts);
     } else {
         std::cout << "> Model Choice <" << std::endl;
         auto myread = readreftable(headerfile, reftablefile, nref, false, opts.count("g") > 0 ? opts["g"].as() : "");
-        const auto statobs = readStatObs(statobsfile);
+        auto origobs = readStatObs(statobsfile);
+        size_t nstat = myread.stats_names.size();
+        size_t num_samples = origobs.size() / nstat;
+        if (((origobs.size() % nstat) != 0) || (num_samples < 1)) {
+            std::cout << "wrong number of summary statistics in statobs file." << std::endl;
+            exit(1);
+        }
+        MatrixXd statobs = Map(origobs.data(),nstat,num_samples).transpose();
 
         auto res = ModelChoice_fun(myread,statobs,opts);
     }
diff --git a/src/EstimParam.cpp b/src/EstimParam.cpp
index 64b4e0a..8ccd5e4 100644
--- a/src/EstimParam.cpp
+++ b/src/EstimParam.cpp
@@ -25,17 +25,17 @@ using namespace ranger;
 using namespace Eigen;
 using namespace ranges;
 
-template 
+template 
 EstimParamResults EstimParam_fun(Reftable &myread,
-                                 std::vector origobs,
+                                 MatrixXd statobs,
                                  const cxxopts::ParseResult opts,
                                  bool quiet,
                                  bool weights)
 {
     size_t nref, ntree, nthreads, noisecols, seed, minnodesize, ntest;
     std::string outfile, parameter_of_interest;
-    double chosenscen,plsmaxvar;
-    bool plsok,seeded;
+    double chosenscen, plsmaxvar;
+    bool plsok, seeded;
 
     ntree = opts["t"].as();
     nthreads = opts["j"].as();
@@ -52,123 +52,136 @@ EstimParamResults EstimParam_fun(Reftable &myread,
     outfile = (opts.count("output") == 0) ? 
"estimparam_out" : opts["o"].as(); - double p_threshold_PLS = 0.99; - std::vector samplefract{std::min(1e5,static_cast(myread.nrec))/static_cast(myread.nrec)}; + std::vector samplefract{std::min(1e5, static_cast(myread.nrec)) / static_cast(myread.nrec)}; auto nstat = myread.stats_names.size(); - MatrixXd statobs(1, nstat); - MatrixXd emptyrow(1,0); + // MatrixXd statobs(1, nstat); + MatrixXd emptyrow(1, 0); + size_t num_samples = statobs.rows(); - statobs = Map(origobs.data(), 1, nstat); + // statobs = Map(origobs.data(), 1, nstat); - std::size_t p1,p2; + std::size_t p1, p2; op_type op; - parse_paramexpression(myread.params_names,parameter_of_interest,op, p1, p2); + parse_paramexpression(myread.params_names, parameter_of_interest, op, p1, p2); auto nparam = myread.params_names.size(); nref = myread.nrec; - ntest = std::min(nref,ntest); + ntest = std::min(nref, ntest); VectorXd y(nref); - switch(op) { - case op_type::none : - y = myread.params(all,p1); - break; - case op_type::divide : - y = myread.params(all,p1).array() / myread.params(all,p2).array(); - break; - case op_type::multiply : - y = myread.params(all,p1)*myread.params(all,p2); - break; + switch (op) + { + case op_type::none: + y = myread.params(all, p1); + break; + case op_type::divide: + y = myread.params(all, p1).array() / myread.params(all, p2).array(); + break; + case op_type::multiply: + y = myread.params(all, p1) * myread.params(all, p2); + break; } // myread.params = std::move(myread.params(indexesModel,param_num)).eval(); - if (y.array().isNaN().any()) { + if (y.array().isNaN().any()) + { std::cout << "Error : there is some nan in the parameter data." << std::endl; exit(1); } - EstimParamResults res; - MatrixXd data_extended(nref,0); - + MatrixXd data_extended(nref, 0); - if (plsok) { + if (plsok) + { size_t ncomp_total = static_cast(lround(1.0 * static_cast(nstat))); MatrixXd Projection; - RowVectorXd mean,std; + RowVectorXd mean, std; size_t nComposante_sel; - - const std::string& pls_filename = outfile + ".plsvar"; + + const std::string &pls_filename = outfile + ".plsvar"; std::ofstream pls_file; - if (!quiet) pls_file.open(pls_filename, std::ios::out); + if (!quiet) + pls_file.open(pls_filename, std::ios::out); - if (plsmaxvar == 0.0 ) { + auto validvars = filterConstantVars(myread.stats); + if (plsmaxvar == 0.0) + { std::cout << "Using elbow method for selecting PLS axes" << std::endl; - VectorXd percentYvar = pls(myread.stats, - y, - ncomp_total,Projection, mean, std,true); + VectorXd percentYvar = pls(myread.stats(all,validvars), + y, + (ncomp_total - (nstat - validvars.size())), Projection, mean, std, true); nComposante_sel = percentYvar.size(); - for (auto i = 0; i < nComposante_sel; i++) res.plsvar.push_back(percentYvar(i)); - } else { - VectorXd percentYvar = pls(myread.stats, - y, - ncomp_total,Projection, mean, std,false); + for (auto i = 0; i < nComposante_sel; i++) + res.plsvar.push_back(percentYvar(i)); + } + else + { + VectorXd percentYvar = pls(myread.stats(all,validvars), + y, + (ncomp_total - (nstat - validvars.size())), Projection, mean, std, false); auto maxres = percentYvar[percentYvar.size() - 1]; - for(auto i = 0; i < percentYvar.size(); i++) - if (percentYvar(i) <= plsmaxvar * maxres) res.plsvar.push_back(percentYvar(i)); + for (auto i = 0; i < percentYvar.size(); i++) + if (percentYvar(i) <= plsmaxvar * maxres) + res.plsvar.push_back(percentYvar(i)); nComposante_sel = res.plsvar.size(); } - //res.plsvar = std::vector(percentYvar.size()); - for(auto p: res.plsvar) - if (!quiet) { + for (auto p : 
res.plsvar) + if (!quiet) + { pls_file << p << std::endl; pls_file.close(); } - - if (!quiet) std::cout << "Selecting only " << nComposante_sel << " pls components." << std::endl; + if (!quiet) + std::cout << "Selecting only " << nComposante_sel << " pls components." << std::endl; double sumPlsweights = Projection.col(0).array().abs().sum(); - auto weightedPlsfirst = Projection.col(0)/sumPlsweights; + auto weightedPlsfirst = Projection.col(0) / sumPlsweights; - const std::string& plsweights_filename = outfile + ".plsweights"; + const std::string &plsweights_filename = outfile + ".plsweights"; std::ofstream plsweights_file; - if (!quiet) plsweights_file.open(plsweights_filename, std::ios::out); - for(auto& p : views::zip(myread.stats_names, weightedPlsfirst) - | to_vector - | actions::sort([](auto& a, auto& b){ return std::abs(a.second) > std::abs(b.second); })) { - if (!quiet) plsweights_file << p.first << " " << p.second << std::endl; - res.plsweights.push_back(p); - } + if (!quiet) + plsweights_file.open(plsweights_filename, std::ios::out); + for (auto &p : views::zip(myread.stats_names, weightedPlsfirst) | to_vector | actions::sort([](auto &a, auto &b) { return std::abs(a.second) > std::abs(b.second); })) + { + if (!quiet) + plsweights_file << p.first << " " << p.second << std::endl; + res.plsweights.push_back(p); + } plsweights_file.close(); - auto Xc = (myread.stats.array().rowwise()-mean.array()).rowwise()/std.array(); - addCols(data_extended,(Xc.matrix() * Projection).leftCols(nComposante_sel)); - auto Xcobs = (statobs.array().rowwise()-mean.array()).rowwise()/std.array(); - addCols(statobs,(Xcobs.matrix() * Projection).leftCols(nComposante_sel)); - for(auto i = 0; i < nComposante_sel; i++) - myread.stats_names.push_back("Comp " + std::to_string(i+1)); - + auto Xc = (myread.stats(all,validvars).array().rowwise() - mean.array()).rowwise() / std.array(); + addCols(data_extended, (Xc.matrix() * Projection).leftCols(nComposante_sel)); + auto Xcobs = (statobs(all,validvars).array().rowwise() - mean.array()).rowwise() / std.array(); + addCols(statobs, (Xcobs.matrix() * Projection).leftCols(nComposante_sel)); + for (auto i = 0; i < nComposante_sel; i++) + myread.stats_names.push_back("Comp " + std::to_string(i + 1)); } - if (!quiet) { - const std::string& settings_filename = outfile + ".settings"; + if (!quiet) + { + const std::string &settings_filename = outfile + ".settings"; std::ofstream settings_file; settings_file.open(settings_filename, std::ios::out); settings_file << "Parameter estimation analyses proceeded using: " << std::endl; - settings_file << "- " << "Parameter name: " << parameter_of_interest << std::endl; - settings_file << "- " << "Scenario " << chosenscen << std::endl; + settings_file << "- " + << "Parameter name: " << parameter_of_interest << std::endl; + settings_file << "- " + << "Scenario " << chosenscen << std::endl; settings_file << "- " << myread.nrec << " simulated datasets" << std::endl; settings_file << "- " << ntree << " trees" << std::endl; - settings_file << "- " << "Minimum node size of " << (minnodesize == 0 ? 5 : minnodesize) << std::endl; + settings_file << "- " + << "Minimum node size of " << (minnodesize == 0 ? 
5 : minnodesize) << std::endl; settings_file << "- " << myread.stats.cols() << " summary statistics" << std::endl; - if (plsok) { + if (plsok) + { settings_file << "- " << data_extended.cols() << " axes of summary statistics PLS linear combination" << std::endl; } settings_file << "- " << noisecols << " noise variables" << std::endl; @@ -176,261 +189,316 @@ EstimParamResults EstimParam_fun(Reftable &myread, settings_file.close(); } - addNoise(myread, data_extended, statobs, noisecols); std::vector varwithouty = myread.stats_names; - auto datastatobs = unique_cast, Data>(std::make_unique>(statobs,emptyrow,varwithouty, 1, varwithouty.size())); - addCols(data_extended,y); + auto datastatobs = unique_cast, Data>(std::make_unique>(statobs, emptyrow, varwithouty, num_samples, varwithouty.size())); + addCols(data_extended, y); myread.stats_names.push_back("Y"); - auto datastats = unique_cast, Data>(std::make_unique>(myread.stats,data_extended,myread.stats_names, nref, myread.stats_names.size())); + auto datastats = unique_cast, Data>(std::make_unique>(myread.stats, data_extended, myread.stats_names, nref, myread.stats_names.size())); ForestOnlineRegression forestreg; - forestreg.init("Y", // dependant variable - MemoryMode::MEM_DOUBLE, // memory mode double or float - std::move(datastats), // data - std::move(datastatobs), // predict - std::max(std::floor(static_cast(myread.stats_names.size()-1)/3.0),1.0), // mtry, if 0 sqrt(m -1) but should be m/3 in regression - outfile, // output file name prefix - ntree, // number of trees - (seeded ? seed : r()), // seed rd() - nthreads, // number of threads - ImportanceMode::IMP_GINI, // Default IMP_NONE - minnodesize, // default min node size (classif = 1, regression 5) - "", // status variable name, only for survival - false, // prediction mode (true = predict) - true, // replace - std::vector(0), // unordered variables names - false, // memory_saving_splitting - DEFAULT_SPLITRULE, // gini for classif variance for regression - true, // predict_all - samplefract, // sample_fraction 1 if replace else 0.632 - DEFAULT_ALPHA, // alpha - DEFAULT_MINPROP, // miniprop - false, // holdout - DEFAULT_PREDICTIONTYPE, // prediction type - DEFAULT_NUM_RANDOM_SPLITS, // num_random_splits - false, //order_snps - DEFAULT_MAXDEPTH, - ntest); // max_depth - if (!quiet) forestreg.verbose_out = &std::cout; - forestreg.run(!quiet,true); + forestreg.init("Y", // dependant variable + MemoryMode::MEM_DOUBLE, // memory mode double or float + std::move(datastats), // data + std::move(datastatobs), // predict + std::max(std::floor(static_cast(myread.stats_names.size() - 1) / 3.0), 1.0), // mtry, if 0 sqrt(m -1) but should be m/3 in regression + outfile, // output file name prefix + ntree, // number of trees + (seeded ? 
seed : r()), // seed rd() + nthreads, // number of threads + ImportanceMode::IMP_GINI, // Default IMP_NONE + minnodesize, // default min node size (classif = 1, regression 5) + "", // status variable name, only for survival + false, // prediction mode (true = predict) + true, // replace + std::vector(0), // unordered variables names + false, // memory_saving_splitting + DEFAULT_SPLITRULE, // gini for classif variance for regression + true, // predict_all + samplefract, // sample_fraction 1 if replace else 0.632 + DEFAULT_ALPHA, // alpha + DEFAULT_MINPROP, // miniprop + false, // holdout + DEFAULT_PREDICTIONTYPE, // prediction type + DEFAULT_NUM_RANDOM_SPLITS, // num_random_splits + false, //order_snps + DEFAULT_MAXDEPTH, + ntest); // max_depth + if (!quiet) + forestreg.verbose_out = &std::cout; + forestreg.run(!quiet, true); auto preds = forestreg.getPredictions(); // Variable Importance res.variable_importance = forestreg.getImportance(); - if (!quiet) forestreg.writeImportanceFile(); + if (!quiet) + forestreg.writeImportanceFile(); // OOB error by number of trees res.ntree_oob_error = preds[2][0]; // res.oob_error = preds[2][0][ntree-1]; - if (!quiet) forestreg.writeOOBErrorFile(); + if (!quiet) + forestreg.writeOOBErrorFile(); // Values/weights res.values_weights = forestreg.getWeights(); - if (!quiet) forestreg.writeWeightsFile(); - + if (!quiet) + forestreg.writeWeightsFile(); -// auto dataptr2 = forestreg.releaseData(); -// auto& datareleased2 = static_cast&>(*dataptr2.get()); -// datareleased2.data.conservativeResize(NoChange,nstat); -// myread.stats = std::move(datareleased2.data); + // auto dataptr2 = forestreg.releaseData(); + // auto& datareleased2 = static_cast&>(*dataptr2.get()); + // datareleased2.data.conservativeResize(NoChange,nstat); + // myread.stats = std::move(datareleased2.data); myread.stats_names.resize(nstat); - std::vector probs{0.05,0.5,0.95}; + std::vector probs{0.05, 0.5, 0.95}; - std::vector rCI,relativeCI,lrCI,lrelativeCI; + std::vector rCI, relativeCI; std::ostringstream os; std::vector obs; - std::copy(y.begin(),y.end(),std::back_inserter(obs)); + std::copy(y.begin(), y.end(), std::back_inserter(obs)); - if (weights) res.oob_weights = MatrixXd(ntest,nref); + if (weights) + res.oob_weights = MatrixXd(ntest, nref); res.oob_map = forestreg.oob_subset; - double expectation = 0.0; - double variance = 0.0; - std::map global_local{ - { "global", "Global (prior) errors" }, - { "local", "Local (posterior) errors" } - }; - - std::map mean_median_ci{ - { "mean", "Computed from the mean taken as point estimate" }, - { "median", "Computed from the median taken as point estimate" }, - { "ci", "Confidence interval measures"} - }; - - std::vector mean_median_ci_l{"mean","median","ci"}; - - std::vector computed{"NMAE","MSE","NMSE"}; - - std::map ci{ - { "cov", "90% coverage" }, - { "meanCI", "Mean 90% CI" }, - { "meanRCI", "Mean relative 90% CI" }, - { "medianCI", "Median 90% CI" }, - { "medianRCI", "Median relative 90% CI" } - }; - - for(auto g_l : global_local) { - for(auto c: computed) { - res.errors[g_l.first]["mean"][c] = 0.0; - res.errors[g_l.first]["median"][c] = 0.0; - } - for(auto c: ci) { - if (g_l.first == "global") - res.errors[g_l.first]["ci"][c.first] = 0.0; + std::vector expectation(num_samples, 0.0); + std::vector variance(num_samples, 0.0); + std::map global_local{ + {"global", "Global (prior) errors"}, + {"local", "Local (posterior) errors"}}; + + std::map mean_median_ci{ + {"mean", "Computed from the mean taken as point estimate"}, + {"median", "Computed 
from the median taken as point estimate"}, + {"ci", "Confidence interval measures"}}; + + std::vector mean_median_ci_l{"mean", "median", "ci"}; + + std::vector computed{"NMAE", "MSE", "NMSE"}; + + std::map ci{ + {"cov", "90% coverage"}, + {"meanCI", "Mean 90% CI"}, + {"meanRCI", "Mean relative 90% CI"}, + {"medianCI", "Median 90% CI"}, + {"medianRCI", "Median relative 90% CI"}}; + + res.errors["local"] = std::vector>>(num_samples); + res.errors["global"] = std::vector>>(1); + + for (auto g_l : global_local) + { + for (auto c : computed) + { + for (size_t j = 0; j < num_samples; j++) + { + if (g_l.first == "local" || j == 0) + { + res.errors[g_l.first][j]["mean"][c] = 0.0; + res.errors[g_l.first][j]["median"][c] = 0.0; + } + } } + if (g_l.first == "global") + for (auto c : ci) + { + res.errors[g_l.first][0]["ci"][c.first] = 0.0; + } } // double variance2 = 0.0; - double sumw = 0.0; - std::vector quants = forestQuantiles(obs,preds[4][0],probs); + std::vector sumw(num_samples, 0.0); + std::vector> quants = forestQuantiles_b(obs, preds[4], probs); - std::vector> quants_w = forestQuantiles_b(obs,preds[5],probs); + std::vector> quants_w = forestQuantiles_b(obs, preds[5], probs); // std::mutex mutex_quant; - for(auto i = 0; i < nref; i++) { - // ThreadPool::ParallelFor(0, nref,[&](auto i){ - auto w = preds[4][0][i]; + for (auto i = 0; i < nref; i++) + { auto pred_oob = preds[0][0][i]; auto reality = y(i); - // mutex_quant.lock(); - expectation += w * reality; - if (!std::isnan(pred_oob)) { - double diff = reality - pred_oob; - auto sqdiff = diff * diff; - res.errors["global"]["mean"]["NMAE"] += std::abs(diff / reality); - res.errors["global"]["mean"]["MSE"] += sqdiff; - res.errors["global"]["mean"]["NMSE"] += sqdiff / reality; - res.errors["local"]["mean"]["NMAE"] += w * std::abs(diff / reality); - res.errors["local"]["mean"]["MSE"] += w * sqdiff; - res.errors["local"]["mean"]["NMSE"] += w * sqdiff / reality; + double diff = reality - pred_oob; + auto sqdiff = diff * diff; + // ThreadPool::ParallelFor(0, nref,[&](auto i){ + if (!std::isnan(pred_oob)) + { + res.errors["global"][0]["mean"]["NMAE"] += std::abs(diff / reality); + res.errors["global"][0]["mean"]["MSE"] += sqdiff; + res.errors["global"][0]["mean"]["NMSE"] += sqdiff / reality; + } + for (auto j = 0; j < num_samples; j++) + { + auto w = preds[4][j][i]; + // mutex_quant.lock(); + expectation[j] += w * reality; + if (!std::isnan(pred_oob)) + { + res.errors["local"][j]["mean"]["NMAE"] += w * std::abs(diff / reality); + res.errors["local"][j]["mean"]["MSE"] += w * sqdiff; + res.errors["local"][j]["mean"]["NMSE"] += w * sqdiff / reality; + } } // mutex_quant.unlock(); - if (i < ntest) { - auto p = *(std::next(forestreg.oob_subset.begin(),i)); - if (weights) - for (auto i = 0; i < nref; i++) res.oob_weights(p.second,i) = preds[5][p.second][i]; + if (i < ntest) + { + auto p = *(std::next(forestreg.oob_subset.begin(), i)); + if (weights) + for (auto i = 0; i < nref; i++) + res.oob_weights(p.second, i) = preds[5][p.second][i]; // std::vector quants_oob = forestQuantiles(obs,preds[5][p.second],probs); std::vector quants_oob = quants_w[p.second]; auto reality = y(p.first); - auto w = preds[4][0][p.first]; auto diff = quants_oob[1] - reality; auto sqdiff = diff * diff; auto CI = quants_oob[2] - quants_oob[0]; - // mutex_quant.lock(); - sumw += w; - res.errors["global"]["median"]["NMAE"] += std::abs(diff / reality); - res.errors["global"]["median"]["MSE"] += sqdiff; - res.errors["global"]["median"]["NMSE"] += sqdiff / reality; - 
res.errors["local"]["median"]["NMAE"] += w * std::abs(diff / reality); - res.errors["local"]["median"]["MSE"] += w * sqdiff; - res.errors["local"]["median"]["NMSE"] += w * sqdiff / reality; + res.errors["global"][0]["median"]["NMAE"] += std::abs(diff / reality); + res.errors["global"][0]["median"]["MSE"] += sqdiff; + res.errors["global"][0]["median"]["NMSE"] += sqdiff / reality; double inside = ((reality <= quants_oob[2]) && (reality >= quants_oob[0])) ? 1.0 : 0.0; - res.errors["global"]["ci"]["cov"] += inside; + res.errors["global"][0]["ci"]["cov"] += inside; rCI.push_back(CI); relativeCI.push_back(CI / reality); + for (auto j = 0; j < num_samples; j++) + { + auto w = preds[4][j][p.first]; + sumw[j] += w; + res.errors["local"][j]["median"]["NMAE"] += w * std::abs(diff / reality); + res.errors["local"][j]["median"]["MSE"] += w * sqdiff; + res.errors["local"][j]["median"]["NMSE"] += w * sqdiff / reality; + } + // mutex_quant.lock(); // mutex_quant.unlock(); } } // }); - for(auto i = 0; i < nref; i++) { + if (sumw[0] == 0.0 && ntest != 0 && !quiet) + std::cout << "Warning : not enough oob samples to have local errors with median as point estimates" << std::endl; + res.point_estimates = std::vector>(num_samples); + for (auto j = 0; j < num_samples; j++) + { + std::map point_estimates{ + {"Expectation", expectation[j]}, + {"Median", quants[j][1]}, + {"Quantile_0.05", quants[j][0]}, + {"Quantile_0.95", quants[j][2]}, + {"Variance", res.errors["local"][j]["mean"]["MSE"]}}; + for (auto p : point_estimates) + res.point_estimates[j].insert(p); } - if (sumw == 0.0 && ntest != 0 && !quiet) std::cout << "Warning : not enough oob samples to have local errors with median as point estimates" << std::endl; - - std::map point_estimates{ - { "Expectation", expectation }, - { "Median", quants[1] }, - { "Quantile_0.05", quants[0] }, - { "Quantile_0.95", quants[2] }, - { "Variance", res.errors["local"]["mean"]["MSE"] } - }; - os << "Parameter estimation (point estimates)" << std::endl; - for(auto p: point_estimates) os << fmt::format("{:>14}",p.first); - os << std::endl; - for(auto p: point_estimates) os << fmt::format("{:>14.6f}",p.second); - os << std::endl; - for(auto p: point_estimates) res.point_estimates.insert(p); + if (num_samples > 1) os << fmt::format("{:>14}", "Target n°"); + for (auto p : res.point_estimates[0]) os << fmt::format("{:>14}", p.first); os << std::endl; + for (auto j = 0; j < num_samples; j++) + { + if (num_samples > 1) + os << fmt::format("{:>14}", j + 1); + for (auto p : res.point_estimates[j]) + os << fmt::format(" {:>13.6g}", p.second); + os << std::endl; + } - if (!quiet) { + if (!quiet) + { std::cout << std::endl; std::cout << os.str(); std::cout.flush(); } - const std::string& predict_filename = outfile + ".predictions"; + const std::string &predict_filename = outfile + ".predictions"; std::ofstream predict_file; - if (!quiet) { + if (!quiet) + { predict_file.open(predict_filename, std::ios::out); - if (!predict_file.good()) { + if (!predict_file.good()) + { throw std::runtime_error("Could not write to prediction file: " + predict_filename + "."); } predict_file << os.str(); predict_file.flush(); predict_file.close(); } - + // std::cout << "Sum weights : " << sumw << std::endl; os.clear(); os.str(""); double acc; - for(auto g_l : global_local) { - os << g_l.second << std::endl; - for(auto m: mean_median_ci_l) { - double normalize = 1.0; - if (g_l.first == "global") - normalize = (m != "mean") ? ntest : nref; - else normalize = (m == "median") ? 
sumw : 1; - if (m != "ci") { - os << mean_median_ci[m] << std::endl; - for (auto n : computed) { - res.errors[g_l.first][m][n] /= static_cast(normalize); - os << fmt::format("{:>25} : {:<13}",n, res.errors[g_l.first][m][n]) << std::endl; - } - } - else { - if (g_l.first == "global") { + for (auto g_l : global_local) + { + os << g_l.second; + if (num_samples > 1 && g_l.first == "local") os << ", list of targets" << std::endl; + size_t loop_j = g_l.first == "global" ? 1 : num_samples; + for (size_t j = 0; j < loop_j; j++) + { + os << std::endl; + if (g_l.first == "local" && num_samples > 1) + os << "///////////// Target n° " << j + 1 << std::endl; + for (auto m : mean_median_ci_l) + { + double normalize; + if (g_l.first == "global") + normalize = (m != "mean") ? ntest : nref; + else + normalize = (m == "median") ? sumw[j] : 1; + if (m != "ci") + { os << mean_median_ci[m] << std::endl; - for(auto c : ci) { - if (c.first == "cov") { - acc = res.errors[g_l.first][m][c.first]/static_cast(normalize); - } else { - if(c.first == "meanCI") - acc = ranges::accumulate(rCI,0.0)/static_cast(normalize); - else if (c.first == "meanRCI") - acc = ranges::accumulate(relativeCI,0.0)/static_cast(normalize); - else if (c.first == "medianCI") - acc = median(rCI); - else if (c.first == "medianRCI") - acc = median(relativeCI); + for (auto n : computed) + { + res.errors[g_l.first][j][m][n] /= static_cast(normalize); + os << fmt::format("{:>25} : {:<13}", n, res.errors[g_l.first][j][m][n]) << std::endl; + } + } + else + { + if (g_l.first == "global") + { + os << mean_median_ci[m] << std::endl; + for (auto c : ci) + { + if (c.first == "cov") + { + acc = res.errors[g_l.first][j][m][c.first] / static_cast(normalize); + } + else + { + if (c.first == "meanCI") + acc = ranges::accumulate(rCI, 0.0) / static_cast(normalize); + else if (c.first == "meanRCI") + acc = ranges::accumulate(relativeCI, 0.0) / static_cast(normalize); + else if (c.first == "medianCI") + acc = median(rCI); + else if (c.first == "medianRCI") + acc = median(relativeCI); + } + res.errors[g_l.first][j][m][c.first] = acc; + os << fmt::format("{:>25} : {:<13}", c.second, acc) << std::endl; } - res.errors[g_l.first][m][c.first] = acc; - os << fmt::format("{:>25} : {:<13}",c.second, acc) << std::endl; - } - } - } } os << std::endl; } - - if (!quiet) { + if (!quiet) + { std::cout << os.str(); std::cout.flush(); } - const std::string& teststats_filename = outfile + ".oobstats"; + const std::string &teststats_filename = outfile + ".oobstats"; std::ofstream teststats_file; - if (!quiet) { + if (!quiet) + { teststats_file.open(teststats_filename, std::ios::out); - if (!teststats_file.good()) { - throw std::runtime_error("Could not write to oobstats file " + teststats_filename + "."); + if (!teststats_file.good()) + { + throw std::runtime_error("Could not write to oobstats file " + teststats_filename + "."); } teststats_file << os.str(); teststats_file.flush(); @@ -466,22 +534,46 @@ EstimParamResults EstimParam_fun(Reftable &myread, // mer_file << fmt::format("{:>33.6f}",res.errors["local"]["mean"]["NMAE"]); // mer_file << fmt::format("{:>33.6f}",res.errors["local"]["median"]["NMAE"]); // mer_file << fmt::format("{:>33.6f}",res.errors["global"]["ci"]["cov"]); - // mer_file << endl; + // mer_file << endl; // } return res; } -template -EstimParamResults EstimParam_fun(Reftable &myread, +template +EstimParamResults EstimParam_fun(Reftable &myread, std::vector origobs, const cxxopts::ParseResult opts, bool quiet, - bool weights); + bool weights) +{ + auto nstat = 
myread.stats_names.size();
+    MatrixXd statobs(1, nstat);
+    statobs = Map(origobs.data(), 1, nstat);
+    return EstimParam_fun(myread,statobs,opts,quiet,weights);
+}
-template 
-EstimParamResults EstimParam_fun(Reftable>> &myread,
-                                 std::vector origobs,
-                                 const cxxopts::ParseResult opts,
-                                 bool quiet,
-                                 bool weights);
\ No newline at end of file
+template EstimParamResults EstimParam_fun(Reftable &myread,
+                                          MatrixXd origobs,
+                                          const cxxopts::ParseResult opts,
+                                          bool quiet,
+                                          bool weights);
+
+template EstimParamResults EstimParam_fun(Reftable>> &myread,
+                                          MatrixXd origobs,
+                                          const cxxopts::ParseResult opts,
+                                          bool quiet,
+                                          bool weights);
+
+
+template EstimParamResults EstimParam_fun(Reftable &myread,
+                                          std::vector origobs,
+                                          const cxxopts::ParseResult opts,
+                                          bool quiet,
+                                          bool weights);
+
+template EstimParamResults EstimParam_fun(Reftable>> &myread,
+                                          std::vector origobs,
+                                          const cxxopts::ParseResult opts,
+                                          bool quiet,
+                                          bool weights);
\ No newline at end of file
diff --git a/src/EstimParam.hpp b/src/EstimParam.hpp
index 998bfaa..56febce 100644
--- a/src/EstimParam.hpp
+++ b/src/EstimParam.hpp
@@ -4,25 +4,33 @@
 #include 
 #include "Reftable.hpp"
 #include "cxxopts.hpp"
-#include 
 
-struct EstimParamResults {
+struct EstimParamResults
+{
     std::vector plsvar;
-    std::vector> plsweights;
-    std::vector> variable_importance;
+    std::vector> plsweights;
+    std::vector> variable_importance;
     std::vector ntree_oob_error;
-    std::vector> values_weights;
-    std::map oob_map;
+    std::vector> values_weights;
+    std::map oob_map;
     Eigen::MatrixXd oob_weights;
-    std::map point_estimates;
-    std::map>> errors;
+    std::vector> point_estimates;
+    std::map>>>
+        errors;
 };
 
-template 
+template 
 EstimParamResults EstimParam_fun(Reftable &reftable,
-                                 std::vector statobs,
-                                 const cxxopts::ParseResult opts,
-                                 bool quiet = false,
-                                 bool weights = false);
\ No newline at end of file
+                                 MatrixXd statobs,
+                                 const cxxopts::ParseResult opts,
+                                 bool quiet = false,
+                                 bool weights = false);
+
+template 
+EstimParamResults EstimParam_fun(Reftable &reftable,
+                                 std::vector statobs,
+                                 const cxxopts::ParseResult opts,
+                                 bool quiet = false,
+                                 bool weights = false);
\ No newline at end of file
diff --git a/src/ForestOnlineRegression.cpp b/src/ForestOnlineRegression.cpp
index 72b2e1a..fe186e9 100644
--- a/src/ForestOnlineRegression.cpp
+++ b/src/ForestOnlineRegression.cpp
@@ -119,7 +119,8 @@ void ForestOnlineRegression::predictInternal(size_t tree_idx)
             predictions[1][sample_idx][tree_idx] = static_cast(getTreePredictionTerminalNodeID(tree_idx, sample_idx));
         else {
             auto value = getTreePrediction(tree_idx, sample_idx);
-            if (std::isnan(value)) throw std::runtime_error("NaN value");
+            // if (std::isnan(value)) throw std::runtime_error("NaN value");
+            if (std::isnan(value)) continue;
             mutex_samples[sample_idx].lock();
             predictions[1][0][sample_idx] += value;
             mutex_samples[sample_idx].unlock();
diff --git a/src/ModelChoice.cpp b/src/ModelChoice.cpp
index 87b7399..46792ed 100644
--- a/src/ModelChoice.cpp
+++ b/src/ModelChoice.cpp
@@ -21,7 +21,7 @@ using namespace ranges;
 
 template 
 ModelChoiceResults ModelChoice_fun(Reftable &myread,
-                                   std::vector obs,
+                                   MatrixXd statobs,
                                    const cxxopts::ParseResult opts,
                                    bool quiet)
 {
@@ -43,12 +43,11 @@ ModelChoiceResults ModelChoice_fun(Reftable &myread,
     std::vector samplefract{std::min(1e5,static_cast(myread.nrec))/static_cast(myread.nrec)};
     auto nstat = myread.stats_names.size();
     size_t K = myread.nrecscen.size();
-    MatrixXd statobs(1, nstat);
     MatrixXd emptyrow(1,0);
+    size_t num_samples = statobs.rows();
 
     size_t n = 
myread.nrec; - statobs = Map(obs.data(), 1, nstat); MatrixXd data_extended(n,0); @@ -92,7 +91,7 @@ ModelChoiceResults ModelChoice_fun(Reftable &myread, for(auto i = 0; i < varwithouty.size(); i++) varwithouty[i] = myread.stats_names[i]; - auto datastatobs = unique_cast, Data>(std::make_unique>(statobs, emptyrow, varwithouty, 1, varwithouty.size())); + auto datastatobs = unique_cast, Data>(std::make_unique>(statobs, emptyrow, varwithouty, num_samples, varwithouty.size())); auto datastats = unique_cast, Data>(std::make_unique>(myread.stats, data_extended, myread.stats_names, myread.nrec, myread.stats_names.size())); ForestOnlineClassification forestclass; forestclass.init("Y", // dependant variable @@ -143,12 +142,13 @@ ModelChoiceResults ModelChoice_fun(Reftable &myread, res.ntree_oob_error = preds[2][0]; if (!quiet) forestclass.writeOOBErrorFile(); - vector votes(K); - for(auto& tree_pred : preds[1][0]) votes[static_cast(tree_pred-1)]++; - res.votes = votes; - - size_t predicted_model = std::distance(votes.begin(),std::max_element(votes.begin(),votes.end())); - res.predicted_model = predicted_model; + size_t nobs = statobs.rows(); + res.votes = std::vector>(num_samples,std::vector(K)); + res.predicted_model = std::vector(num_samples); + for(auto i = 0; i < num_samples; i++) { + for(auto& tree_pred : preds[1][i]) res.votes[i][static_cast(tree_pred-1)]++; + res.predicted_model[i] = std::distance(res.votes[i].begin(),std::max_element(res.votes[i].begin(),res.votes[i].end())); + } size_t ycol = data_extended.cols() - 1; @@ -219,20 +219,26 @@ ModelChoiceResults ModelChoice_fun(Reftable &myread, myread.stats_names.resize(nstat); auto predserr = forestreg.getPredictions(); - res.post_proba = predserr[1][0][0]; + res.post_proba = std::vector(num_samples); + for(auto i = 0; i < num_samples; i++) res.post_proba[i] = predserr[1][0][i]; const std::string& predict_filename = outfile + ".predictions"; std::ostringstream os; - for(auto i = 0; i < votes.size(); i++) { + if (num_samples > 1) os << fmt::format("{:>14}", "Target n°"); + for(auto i = 0; i < K; i++) { os << fmt::format("{:>14}",fmt::format("votes model{0}",i+1)); } os << fmt::format(" selected model"); - os << fmt::format(" post proba\n"); - for(auto i = 0; i < votes.size(); i++) { - os << fmt::format("{:>14}",votes[i]); + os << fmt::format(" post proba\n"); + for (auto j = 0; j < num_samples; j++) { + if (num_samples > 1) + os << fmt::format("{:>14}", j + 1); + for(auto i = 0; i < K; i++) { + os << fmt::format(" {:>13}",res.votes[j][i]); + } + os << fmt::format("{:>15}", res.predicted_model[j] + 1); + os << fmt::format("{:12.3f}\n",res.post_proba[j]); } - os << fmt::format("{:>15}", predicted_model + 1); - os << fmt::format("{:11.3f}\n",res.post_proba); if (!quiet) std::cout << os.str(); std::cout.flush(); @@ -277,6 +283,30 @@ ModelChoiceResults ModelChoice_fun(Reftable &myread, return res; } +template +ModelChoiceResults ModelChoice_fun(Reftable &myread, + std::vector origobs, + const cxxopts::ParseResult opts, + bool quiet) +{ + auto nstat = myread.stats_names.size(); + MatrixXd statobs(1, nstat); + statobs = Map(origobs.data(), 1, nstat); + return ModelChoice_fun(myread,statobs,opts,quiet); +} + +template +ModelChoiceResults ModelChoice_fun(Reftable &myread, + MatrixXd obs, + const cxxopts::ParseResult opts, + bool quiet); + +template +ModelChoiceResults ModelChoice_fun(Reftable>> &myread, + MatrixXd obs, + const cxxopts::ParseResult opts, + bool quiet); + template ModelChoiceResults ModelChoice_fun(Reftable &myread, std::vector obs, diff 
--git a/src/ModelChoice.hpp b/src/ModelChoice.hpp index d08e704..b0b7e30 100644 --- a/src/ModelChoice.hpp +++ b/src/ModelChoice.hpp @@ -10,11 +10,17 @@ struct ModelChoiceResults std::vector> confusion_matrix; std::vector> variable_importance; std::vector ntree_oob_error; - size_t predicted_model; - std::vector votes; - double post_proba; + std::vector predicted_model; + std::vector> votes; + std::vector post_proba; }; +template +ModelChoiceResults ModelChoice_fun(Reftable &reftable, + MatrixXd statobs, + const cxxopts::ParseResult opts, + bool quiet = false); + template ModelChoiceResults ModelChoice_fun(Reftable &reftable, std::vector statobs, diff --git a/src/pls-eigen.hpp b/src/pls-eigen.hpp index a0361be..ccbe4b1 100644 --- a/src/pls-eigen.hpp +++ b/src/pls-eigen.hpp @@ -24,6 +24,16 @@ using namespace Eigen; using namespace std; using namespace ranges; +template +std::vector filterConstantVars(const MatrixBase& xr) { + auto meanr = xr.colwise().mean(); + auto stdr = ((xr.rowwise() - meanr).array().square().colwise().sum() / (xr.rows() - 1)).sqrt();; + std::vector validvars; + for(size_t i = 0; i< xr.cols(); i++) { + if (stdr(i) >= 1.0e-8) validvars.push_back(i); + } + return validvars; +} template VectorXd pls(const MatrixBase& x, diff --git a/src/pyabcranger/pyabcranger.cpp b/src/pyabcranger/pyabcranger.cpp index 9ce526a..c724f37 100644 --- a/src/pyabcranger/pyabcranger.cpp +++ b/src/pyabcranger/pyabcranger.cpp @@ -62,14 +62,29 @@ ModelChoiceResults ModelChoice_fun_py(Reftable> &reftabl return ModelChoice_fun(reftable,statobs,parseopt(options),quiet); } +ModelChoiceResults ModelChoice_multi_fun_py(Reftable> &reftable, + MatrixXd& statobs, + std::string options, + bool quiet = false) { + return ModelChoice_fun(reftable,statobs,parseopt(options),quiet); +} + +EstimParamResults EstimParam_multi_fun_py(Reftable> &reftable, + MatrixXd& statobs, + std::string options, + bool quiet = false, + bool weights = false) { + return EstimParam_fun(reftable,statobs,parseopt(options),quiet,weights); +} + EstimParamResults EstimParam_fun_py(Reftable> &reftable, std::vector statobs, std::string options, bool quiet = false, bool weights = false) { return EstimParam_fun(reftable,statobs,parseopt(options),quiet,weights); -} - +} + using namespace Eigen; PYBIND11_MODULE(pyabcranger, m) { @@ -101,7 +116,9 @@ PYBIND11_MODULE(pyabcranger, m) { .def_readwrite("errors",&EstimParamResults::errors); m.def("modelchoice", &ModelChoice_fun_py, py::call_guard()); + m.def("modelchoice_multi", &ModelChoice_multi_fun_py, py::call_guard()); m.def("estimparam", &EstimParam_fun_py, py::call_guard()); + m.def("estimparam_multi", &EstimParam_multi_fun_py, py::call_guard()); m.def("forestQuantiles_b", [](const std::vector& obs, const std::vector>& weights, const std::vector& asked){ diff --git a/test/data/statobsRF2.txt b/test/data/statobsRF2.txt new file mode 100644 index 0000000..7151f9c --- /dev/null +++ b/test/data/statobsRF2.txt @@ -0,0 +1,4 @@ + ML1p_1 ML1p_2 ML1p_3 ML1p_4 ML2p_1.2 ML2p_1.3 ML2p_1.4 ML2p_2.3 ML2p_2.4 ML2p_3.4 ML3p_1.2.3 ML3p_1.2.4 ML3p_1.3.4 ML3p_2.3.4 HWm_1 HWm_2 HWm_3 HWm_4 HWv_1 HWv_2 HWv_3 HWv_4 HBm_1.2 HBm_1.3 HBm_1.4 HBm_2.3 HBm_2.4 HBm_3.4 HBv_1.2 HBv_1.3 HBv_1.4 HBv_2.3 HBv_2.4 HBv_3.4 FST1m_1 FST1m_2 FST1m_3 FST1m_4 FST1v_1 FST1v_2 FST1v_3 FST1v_4 FST2m_1.2 FST2m_1.3 FST2m_1.4 FST2m_2.3 FST2m_2.4 FST2m_3.4 FST2v_1.2 FST2v_1.3 FST2v_1.4 FST2v_2.3 FST2v_2.4 FST2v_3.4 NEIm_1.2 NEIm_1.3 NEIm_1.4 NEIm_2.3 NEIm_2.4 NEIm_3.4 NEIv_1.2 NEIv_1.3 NEIv_1.4 NEIv_2.3 NEIv_2.4 NEIv_3.4 AMLm_1.2.3 AMLm_2.1.3 
AMLm_3.1.2 AMLm_1.2.4 AMLm_2.1.4 AMLm_4.1.2 AMLm_1.3.4 AMLm_3.1.4 AMLm_4.1.3 AMLm_2.3.4 AMLm_3.2.4 AMLm_4.2.3 AMLv_1.2.3 AMLv_2.1.3 AMLv_3.1.2 AMLv_1.2.4 AMLv_2.1.4 AMLv_4.1.2 AMLv_1.3.4 AMLv_3.1.4 AMLv_4.1.3 AMLv_2.3.4 AMLv_3.2.4 AMLv_4.2.3 FST3m_1.2.3 FST3m_1.2.4 FST3m_1.3.4 FST3m_2.3.4 FST3v_1.2.3 FST3v_1.2.4 FST3v_1.3.4 FST3v_2.3.4 FST4m_1.2.3.4 FST4v_1.2.3.4 F3m_1.2.3 F3m_2.1.3 F3m_3.1.2 F3m_1.2.4 F3m_2.1.4 F3m_4.1.2 F3m_1.3.4 F3m_3.1.4 F3m_4.1.3 F3m_2.3.4 F3m_3.2.4 F3m_4.2.3 F3v_1.2.3 F3v_2.1.3 F3v_3.1.2 F3v_1.2.4 F3v_2.1.4 F3v_4.1.2 F3v_1.3.4 F3v_3.1.4 F3v_4.1.3 F3v_2.3.4 F3v_3.2.4 F3v_4.2.3 F4m_1.2.3.4 F4m_1.3.2.4 F4m_1.4.2.3 F4v_1.2.3.4 F4v_1.3.2.4 F4v_1.4.2.3 + + 0.16040000 0.20900000 0.17140000 0.13560000 0.04760000 0.04040000 0.04800000 0.08960000 0.05320000 0.06220000 0.01180000 0.00980000 0.01740000 0.02820000 0.24573415 0.23733326 0.24383954 0.24676333 0.03115738 0.03385790 0.03262390 0.03103325 0.26197939 0.26098241 0.25774253 0.25476021 0.25756725 0.25379183 0.02926918 0.02846975 0.02784482 0.03051622 0.02969639 0.02902693 0.04681769 0.07940407 0.05416672 0.04282559 0.46879378 0.50942572 0.49085903 0.46692610 0.07836146 0.06225006 0.04458234 0.05549477 0.06025723 0.03358946 0.00945500 0.00778649 0.00575185 0.00717580 0.00750894 0.00455083 0.03503132 0.02976547 0.02398338 0.02749575 0.02921387 0.02048164 0.00441123 0.00338649 0.00230662 0.00282876 0.00331057 0.00172847 0.46100068 0.43007067 0.46660871 0.41585252 0.45461734 0.53559965 0.44427018 0.41160874 0.46579589 0.52307069 0.45610501 0.42763848 0.20082501 0.19845348 0.18797401 0.19051696 0.20904682 0.18271819 0.20296278 0.19137962 0.18182854 0.20989997 0.19033713 0.18543345 0.06555334 0.06133700 0.04699794 0.04994778 0.00546414 0.00500669 0.00396932 0.00426342 0.05605008 0.00364899 0.01123372 0.00921197 0.00496184 0.00821026 0.01223543 0.00328353 0.00959948 0.00659609 0.00189431 0.01060119 0.00357263 0.00491777 0.00118795 0.00098739 0.00071405 0.00090522 0.00111591 0.00065685 0.00086428 0.00061343 0.00046779 0.00091743 0.00050876 0.00064362 -0.00302346 -0.00163424 0.00138922 0.00057110 0.00065931 0.00050151 + 0.16040000 0.20900000 0.16140000 0.13560000 0.01760000 0.04040000 0.04800000 0.08960000 0.05320000 0.06220000 0.01180000 0.00980000 0.01740000 0.02820000 0.24573415 0.23733326 0.24383954 0.24676333 0.03115738 0.03385790 0.03262390 0.03103325 0.26197939 0.26098241 0.25774253 0.25476021 0.25756725 0.25379183 0.02926918 0.02846975 0.02784482 0.03051622 0.02969639 0.02902693 0.04681769 0.07940407 0.05416672 0.04282559 0.46879378 0.50942572 0.49085903 0.46692610 0.07836146 0.06225006 0.04458234 0.05549477 0.06025723 0.03358946 0.00945500 0.00778649 0.00575185 0.00717580 0.00750894 0.00455083 0.03503132 0.02976547 0.02398338 0.02749575 0.02921387 0.02048164 0.00441123 0.00338649 0.00230662 0.00282876 0.00331057 0.00172847 0.46100068 0.43007067 0.46660871 0.41585252 0.45461734 0.53559965 0.44427018 0.41160874 0.46579589 0.52307069 0.45610501 0.42763848 0.20082501 0.19845348 0.18797401 0.19051696 0.20904682 0.18271819 0.20296278 0.19137962 0.18182854 0.20989997 0.19033713 0.18543345 0.06555334 0.06133700 0.04699794 0.04994778 0.00546414 0.00500669 0.00396932 0.00426342 0.05605008 0.00364899 0.01123372 0.00921197 0.00496184 0.00821026 0.01223543 0.00328353 0.00959948 0.00659609 0.00189431 0.01060119 0.00357263 0.00491777 0.00118795 0.00098739 0.00071405 0.00090522 0.00111591 0.00065685 0.00086428 0.00061343 0.00046779 0.00091743 0.00050876 0.00064362 -0.00302346 -0.00163424 0.00138922 0.00057110 0.00065931 0.00040151 diff 
--git a/test/forestmodelchoice-ks-test.cpp b/test/forestmodelchoice-ks-test.cpp
index f547f8f..6a630ee 100644
--- a/test/forestmodelchoice-ks-test.cpp
+++ b/test/forestmodelchoice-ks-test.cpp
@@ -64,7 +64,7 @@ TEST_CASE("ModelChoice KS test")
         bar.progress(i,nrun);
         auto res = ModelChoice_fun(myread,statobs,opts,true);
-        postprobas[i] = res.post_proba;
+        postprobas[i] = res.post_proba[0];
     }
 
     std::cout << std::endl;
     double D, pvalue;
diff --git a/test/test.py b/test/test.py
index 7f0c632..f35e8b8 100644
--- a/test/test.py
+++ b/test/test.py
@@ -10,6 +10,13 @@ def test_modelchoice(path):
         os.remove(filePath)
     subprocess.run(path)
 
+def test_modelchoice_multi(path):
+    """Run basic multi target Model choice example
+    """
+    for filePath in glob.glob('modelchoice_out.*'):
+        os.remove(filePath)
+    subprocess.call([path,"-b","statobsRF2.txt"])
+
 def test_estimparam(path):
     """Run basic Parameter estimation example
     """
@@ -17,6 +24,13 @@ def test_estimparam(path):
         os.remove(filePath)
     subprocess.call([path,"--parameter","ra","--chosenscen","3","--noob","50"])
 
+def test_estimparam_multi(path):
+    """Run basic multi target Parameter estimation example
+    """
+    for filePath in glob.glob('estimparam_out.*'):
+        os.remove(filePath)
+    subprocess.call([path,"-b","statobsRF2.txt","--parameter","ra","--chosenscen","3","--noob","50"])
+
 def test_parallel(path):
     """Check multithreaded performance
    """
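
Note on the statobs reshaping in abcranger.cpp: the patch replaces the single observed row by one row per observed target. The sketch below is a minimal standalone version of that logic; it assumes the flat vector returned by readStatObs lists the targets row by row (as in test/data/statobsRF2.txt) and that the Map type elided in the patch text is a plain column-major Map<MatrixXd>, so the data is mapped as an nstat x num_samples block and then transposed.

    // Minimal sketch of the multi-target statobs handling (assumptions: the flat
    // vector holds num_samples targets of nstat statistics each, stored row by row,
    // and the mapping is a column-major Map<MatrixXd>).
    #include <Eigen/Dense>
    #include <iostream>
    #include <vector>

    int main() {
        using namespace Eigen;
        std::vector<double> origobs{1, 2, 3, 4, 5, 6}; // two targets, three statistics each
        size_t nstat = 3;
        size_t num_samples = origobs.size() / nstat;
        if (((origobs.size() % nstat) != 0) || (num_samples < 1)) {
            std::cerr << "wrong number of summary statistics in statobs file." << std::endl;
            return 1;
        }
        // Map the flat data as nstat x num_samples (column-major), then transpose so
        // that each row of statobs is one observed target.
        MatrixXd statobs = Map<MatrixXd>(origobs.data(), nstat, num_samples).transpose();
        std::cout << statobs << std::endl; // prints a 2 x 3 matrix
        return 0;
    }

Each row of the resulting statobs matrix is then treated as an independent prediction target by EstimParam_fun and ModelChoice_fun.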
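Note on filterConstantVars in src/pls-eigen.hpp: summary statistics that are constant across the reference table have zero standard deviation, so the centring and scaling step of the PLS would divide by zero; the helper keeps only the indices of non-constant columns, and EstimParam_fun selects them with myread.stats(all, validvars) before calling pls. A rough standalone equivalent (assuming Eigen 3.4 for the X(all, indices) selection):

    // Keep the indices of columns whose standard deviation is above a small
    // threshold, then select only those columns.
    #include <Eigen/Dense>
    #include <iostream>
    #include <vector>

    std::vector<Eigen::Index> nonConstantColumns(const Eigen::MatrixXd &x) {
        Eigen::RowVectorXd mean = x.colwise().mean();
        Eigen::RowVectorXd stddev =
            ((x.rowwise() - mean).array().square().colwise().sum() / double(x.rows() - 1))
                .sqrt()
                .matrix();
        std::vector<Eigen::Index> valid;
        for (Eigen::Index i = 0; i < x.cols(); ++i)
            if (stddev(i) >= 1.0e-8) valid.push_back(i);
        return valid;
    }

    int main() {
        Eigen::MatrixXd stats(3, 3);
        stats << 1.0, 5.0, 0.1,
                 2.0, 5.0, 0.2,
                 3.0, 5.0, 0.3; // the second column is constant
        auto valid = nonConstantColumns(stats);
        Eigen::MatrixXd filtered = stats(Eigen::all, valid); // 3 x 2, constant column dropped
        std::cout << filtered << std::endl;
        return 0;
    }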
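Note on the per-target vote counting in ModelChoice_fun: preds[1][j] now holds one predicted scenario per tree for target j, so the votes and the selected model are computed separately for each row of statobs. A self-contained sketch of that loop, with made-up tree predictions:

    // Every tree votes for one of K scenarios (1-based in the tree output) and the
    // selected model is the argmax of the votes. The tree predictions are illustrative.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main() {
        size_t K = 3; // number of scenarios
        std::vector<std::vector<double>> tree_preds = {
            {1, 2, 2, 2, 3},  // target 1: predicted model per tree
            {3, 3, 1, 3, 3}}; // target 2
        std::vector<std::vector<size_t>> votes(tree_preds.size(), std::vector<size_t>(K));
        std::vector<size_t> predicted_model(tree_preds.size());
        for (size_t j = 0; j < tree_preds.size(); ++j) {
            for (auto tree_pred : tree_preds[j])
                votes[j][static_cast<size_t>(tree_pred - 1)]++;
            predicted_model[j] = std::distance(
                votes[j].begin(), std::max_element(votes[j].begin(), votes[j].end()));
            std::cout << "target " << j + 1 << ": model " << predicted_model[j] + 1 << std::endl;
        }
        return 0;
    }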
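Note on the error bookkeeping in EstimParam_fun: res.errors is now indexed as errors["global"|"local"][target]["mean"|"median"|"ci"][metric]. Global (prior) errors are accumulated once over the out-of-bag predictions, while local (posterior) errors and the expectation point estimate are accumulated per target with that target's forest weights (preds[4][j]). A simplified sketch with hypothetical values:

    // The global (prior) error uses every out-of-bag prediction once; the local
    // (posterior) errors and the expectation use each target's forest weights.
    #include <cmath>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<double> y        = {1.0, 2.0, 4.0}; // true parameter values
        std::vector<double> pred_oob = {1.1, 1.8, 4.5}; // out-of-bag predictions
        std::vector<std::vector<double>> w = {{0.2, 0.5, 0.3},  // weights[target][simulation],
                                              {0.6, 0.1, 0.3}}; // each row sums to one
        size_t num_samples = w.size();
        double global_nmae = 0.0;
        std::vector<double> local_nmae(num_samples, 0.0), expectation(num_samples, 0.0);
        for (size_t i = 0; i < y.size(); ++i) {
            double diff = y[i] - pred_oob[i];
            if (!std::isnan(pred_oob[i])) global_nmae += std::abs(diff / y[i]);
            for (size_t j = 0; j < num_samples; ++j) {
                expectation[j] += w[j][i] * y[i];
                if (!std::isnan(pred_oob[i])) local_nmae[j] += w[j][i] * std::abs(diff / y[i]);
            }
        }
        global_nmae /= static_cast<double>(y.size()); // prior errors are averaged over nref
        for (size_t j = 0; j < num_samples; ++j)      // local weights already sum to one
            std::cout << "target " << j + 1 << ": expectation=" << expectation[j]
                      << " NMAE=" << local_nmae[j] << std::endl;
        std::cout << "global NMAE=" << global_nmae << std::endl;
        return 0;
    }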