cavalab · gAldeia · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 16, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
       #     cache-env: true
       - 
         name: add docs environment dependencies
-        uses: mamba-org/provision-with-micromamba@main
+        uses: mamba-org/setup-micromamba@v1
         with:
           environment-file: environment.yml
           cache-env: true

diff --git a/ci/ci-environment.yml b/ci/ci-environment.yml
@@ -22,6 +22,7 @@ dependencies:
     - sphinx-material
     - recommonmark
     - nbsphinx
+    - lxml_html_clean
     - matplotlib
     - jupyter
     - seaborn

diff --git a/docs/conf.py b/docs/conf.py
@@ -56,6 +56,7 @@
     'sphinx_math_dollar',
     # 'recommonmark',
     'nbsphinx',
+    'lxml_html_clean',
     # "sphinx.ext.viewcode",
     # External stuff
 ]

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -10,3 +10,4 @@ myst-parser
 nbsphinx
 sphinx-material
 sphinx-math-dollar
+lxml_html_clean
diff --git a/feat/feat.py b/feat/feat.py
@@ -184,7 +184,7 @@ def __init__(self,
                  softmax_norm=False, 
                  save_pop=0, 
                  normalize=True, 
-                 val_from_arch=True, 
+                 val_from_arch=False, 
                  corr_delete_mutate=False, 
                  simplify=0.0, 
                  protected_groups="", 
@@ -316,8 +316,8 @@ def predict_archive(self,X,Z=None,front=False):
         archive = self.cfeat_.get_archive(front)
         preds = []
         for ind in archive:
-            if ind['id'] == 9234:
-                print('individual:',json.dumps(ind,indent=2))
+            # if ind['id'] == 9234:
+            #     print('individual:',json.dumps(ind,indent=2))
             tmp = {}
             tmp['id'] = ind['id']
             tmp['y_pred'] = self.cfeat_.predict_archive(ind['id'], X) 
@@ -399,6 +399,7 @@ def get_representation(self): return self.cfeat_.get_representation()
     def get_model(self, sort=True): return self.cfeat_.get_model(sort)
     def get_coefs(self): return self.cfeat_.get_coefs()
     def get_n_params(self): return self.cfeat_.get_n_params()
+    def get_complexity(self): return self.cfeat_.get_complexity()
     def get_dim(self): return self.cfeat_.get_dim()
     def get_n_nodes(self): return self.cfeat_.get_n_nodes()
 
@@ -432,7 +433,7 @@ def fit(self,X,y,zfile=None,zids=None):
         ])):
             raise ValueError('y must be a contiguous set of labels from ',
                              '0 to n_classes. y contains the values {}'.format(
-                                 np.unique(np.asarray(y)))
+                                self.classes_)
                             )
 
         super().fit(X,y)

diff --git a/src/eval/metrics.cc b/src/eval/metrics.cc
@@ -362,6 +362,7 @@ namespace FT
 
             return loss;
         }
+
         /// 1 - balanced accuracy 
         float bal_zero_one_loss(const VectorXf& y, const VectorXf& yhat, 
                 VectorXf& loss, const vector<float>& class_weights)
@@ -406,6 +407,7 @@ namespace FT
             // set loss vectors if third argument supplied
             loss = (yhat.cast<int>().array() != y.cast<int>().array()).cast<float>();
 
+            // 1 - balanced accuracy (mean of class accuracies), so it is minimized
             return 1.0 - class_accuracies.mean();
         }
 
@@ -435,7 +437,11 @@ namespace FT
         float zero_one_loss(const VectorXf& y, const VectorXf& yhat, VectorXf& loss, 
                    const vector<float>& class_weights)
         {
+            // Feat's update_best and sel/surv steps always handle scores as
+            // minimization problems, so lower must mean better here. That is why
+            // we count mismatches instead of correct classifications:
             loss = (yhat.cast<int>().array() != y.cast<int>().array()).cast<float>();
+
             //TODO: weight loss by sample weights
             return loss.mean();
         }

diff --git a/src/feat.cc b/src/feat.cc
@@ -567,8 +567,12 @@ int Feat::get_n_params(){ return best_ind.get_n_params(); }
 int Feat::get_dim(){ return best_ind.get_dim(); } 
 
 ///get dimensionality of best
-int Feat::get_complexity(){ return best_ind.get_complexity(); } 
-
+int Feat::get_complexity(){
+    // A complexity of 0 is treated as "not calculated yet": compute it first
+    if (best_ind.get_complexity()==0)
+        best_ind.set_complexity();
+    return best_ind.get_complexity();
+} 
 
 /// return the number of nodes in the best model
 int Feat::get_n_nodes(){ return best_ind.program.size(); }
@@ -707,15 +711,14 @@ void Feat::run_generation(unsigned int g,
     pop.update(survivors);
     logger.log("survivors:\n" + pop.print_eqns(), 3);
 
+    // we need to update best, so min_loss_v is updated inside stats
     logger.log("update best...",2);
     bool updated_best = update_best(d);
 
-    logger.log("calculate stats...",2);
-    calculate_stats(d);
-
     if (params.max_stall > 0)
         update_stall_count(stall_count, updated_best);
 
+    logger.log("update objectives...",2);
     if ( (use_arch || params.verbosity>1) || !logfile.empty()) {
         // set objectives to make sure they are reported in log/verbose/arch
         #pragma omp parallel for
@@ -727,6 +730,9 @@ void Feat::run_generation(unsigned int g,
     if (use_arch) 
         archive.update(pop,params);
 
+    logger.log("calculate stats...",2);
+    calculate_stats(d);
+
     if(params.verbosity>1)
         print_stats(log, fraction);    
     else if(params.verbosity == 1)
@@ -1293,7 +1299,7 @@ ArrayXXf Feat::predict_proba(MatrixXf& X)
 }
 
 
-bool Feat::update_best(const DataRef& d, bool validation)
+bool Feat::update_best(const DataRef& d, bool val)
 {
     float bs;
     bs = this->min_loss_v; 
@@ -1463,7 +1469,7 @@ void Feat::print_stats(std::ofstream& log, float fraction)
               << stats.min_loss.back() << " (" 
               << stats.med_loss.back() << ")\n"
               << "Val Loss (Med): " 
-              << this->min_loss_v << " (" << stats.med_loss_v.back() << ")\n"
+              << stats.min_loss_v.back() << " (" << stats.med_loss_v.back() << ")\n"
               << "Median Size (Max): " 
               << stats.med_size.back() << " (" << max_size << ")\n"
               << "Time (s): "   << timer << "\n";
@@ -1553,7 +1559,7 @@ void Feat::log_stats(std::ofstream& log)
     log << params.current_gen          << sep
         << timer.Elapsed().count()     << sep
         << stats.min_loss.back()       << sep
-        << this->min_loss_v            << sep
+        << stats.min_loss_v.back()     << sep
         << stats.med_loss.back()       << sep
         << stats.med_loss_v.back()     << sep
         << stats.med_size.back()       << sep

diff --git a/src/feat.h b/src/feat.h
@@ -93,7 +93,7 @@ class Feat
         //      string logfile="", int max_time=-1, bool residual_xo = false, 
         //      bool stagewise_xo = false, bool stagewise_tol = true, 
         //      bool softmax_norm=false, int save_pop=0, bool normalize=true, 
-        //      bool val_from_arch=true, bool corr_delete_mutate=false, 
+        //      bool val_from_arch=false, bool corr_delete_mutate=false, 
         //      float simplify=0.0, string protected_groups="",
         //      bool tune_initial=false, bool tune_final=true,
         //      string starting_pop="");
@@ -325,7 +325,7 @@ class Feat
         int get_n_params();
         ///get dimensionality of best
         int get_dim();
-        ///get dimensionality of best
+        ///get complexity of best
         int get_complexity();
         ///return population as string
         vector<nl::json> get_archive(bool front);

diff --git a/src/pybind.cc b/src/pybind.cc
@@ -150,6 +150,7 @@ PYBIND11_MODULE(_feat, m)
         .def("load", &Feat::load)
         .def("get_representation", &Feat::get_representation)
         .def("get_n_params", &Feat::get_n_params)
+        .def("get_complexity", &Feat::get_complexity)
         .def("get_dim", &Feat::get_dim)
         .def("get_n_nodes", &Feat::get_n_nodes)
         .def("get_model", &Feat::get_model, py::arg("sort") = true)

diff --git a/tests/evaluationTests.cc b/tests/evaluationTests.cc
@@ -53,6 +53,58 @@ TEST(Evaluation, mse)
     ASSERT_TRUE(score == 28.5);
 }
 
+TEST(Evaluation, accuracy)
+{
+    // test zero one loss: loss flags each mismatch, score is their mean (5/10)
+
+    Feat ft = make_estimator(100, 10, "LinearRidgeRegression", false, 1, 666);
+
+    VectorXf yhat(10), y(10), res(10), loss(10);
+
+    y << 0.0,
+         1.0,
+         0.0,
+         0.0,
+         1.0,
+         0.0,
+         0.0,
+         1.0,
+         0.0,
+         0.0;
+
+    yhat << 0.0,
+	        1.0,
+	        1.0,
+	        0.0,
+	        0.0, 
+	        1.0,
+	        1.0,
+	        0.0,
+	        0.0,
+	        0.0;
+
+    res << 0.0,
+           0.0,
+           1.0,
+           0.0,
+           1.0,
+           1.0,
+           1.0,
+           1.0,
+           0.0,
+           0.0;
+
+    float score = zero_one_loss(y, yhat, loss, ft.params.class_weights);
+
+    if (loss != res)
+    {
+        std::cout << "loss:" << loss.transpose() << "\n";
+        std::cout << "res:" << res.transpose() << "\n";
+    }
+    ASSERT_TRUE(loss == res);
+    ASSERT_EQ(((int)(score*1000000)), 500000);
+}
+
 TEST(Evaluation, bal_accuracy)
 {
     // test balanced zero one loss

diff --git a/tests/wrappertest.py b/tests/wrappertest.py
@@ -67,7 +67,8 @@ def test_sklearn_api(self):
                        'check_fit2d_1sample',
                        'check_fit2d_1feature',
                        'check_transformer_data_not_an_array',
-                       'check_transformer_preserve_dtypes' 
+                       'check_transformer_preserve_dtypes',
+                       'check_estimators_dtypes'
                       ]
         for est, check in check_generator2:
             time_to_go=False