Skip to content

Commit

Permalink
Add kmeans++ initialization implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
dattatreya303 committed Jun 26, 2018
1 parent 77d9d7f commit 157e554
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 7 deletions.
8 changes: 5 additions & 3 deletions doc/samples/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <boost/numeric/ublas/kmeans/kmeans.hpp>
#include <boost/numeric/ublas/kmeans/random_initialization.hpp>
#include <boost/numeric/ublas/kmeans/kmeans++.hpp>
#include <boost/numeric/ublas/kmeans/naive_kmeans.hpp>

#include <string.h>
Expand All @@ -24,7 +25,8 @@ int main() {
std::cin >> gt (i);
}

KMeans<RandomInitialization, NaiveKMeans> kmeans(100);
// KMeans<RandomInitialization, NaiveKMeans> kmeans(100);
KMeans<KMeansPlusPlus, NaiveKMeans> kmeans(100);
// std::cout << "init!" << std::endl;
int n = 3;
vector<int> assignments (data2.size1 ());
Expand All @@ -36,8 +38,8 @@ int main() {
// std::cout << row (data2, i) << " " << assignments (i) << " " << gt (i) << std::endl;
for (int j = 0; j < data2.size2 (); j++)
std::cout << data2 (i,j) << " ";
std::cout << assignments (i) << std::endl;
// std::cout << gt (i) << std::endl;
// std::cout << assignments (i) << std::endl;
std::cout << gt (i) << std::endl;
}
return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace boost { namespace numeric { namespace ublas {
RefinedStart () {}

template <class MatrixType>
static void Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
// Placeholder initializer: the body is empty, so `centroids` is left unmodified.
static void Initialize (const MatrixType &data, const size_type num_clusters, matrix<double> &centroids) {}
};
}}}

Expand Down
47 changes: 46 additions & 1 deletion include/boost/numeric/ublas/kmeans/kmeans++.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,17 @@
#ifndef _BOOST_UBLAS_KMEANSPLUSPLUS_
#define _BOOST_UBLAS_KMEANSPLUSPLUS_

#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>

#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <boost/random/uniform_int_distribution.hpp>

#include <boost/container/set.hpp>

#include <ctime>

namespace boost { namespace numeric { namespace ublas {

Expand All @@ -23,7 +33,42 @@ namespace boost { namespace numeric { namespace ublas {
KMeansPlusPlus () {}

template <class MatrixType>
static void Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
/// Chooses initial centroids with the k-means++ seeding scheme.
///
/// The first centroid is a row of `data` picked uniformly at random. Each
/// further centroid is a row picked with probability proportional to its
/// squared Euclidean distance from the nearest centroid chosen so far.
/// The generator is seeded from the current time, so results differ
/// between runs.
///
/// @param data          Matrix whose rows are the candidate points.
/// @param num_clusters  Number of centroids to write.
/// @param centroids     Output; rows 0 .. num_clusters-1 are overwritten.
///                      NOTE(review): assumed to be pre-sized to at least
///                      num_clusters x data.size2 () by the caller -- confirm.
static void Initialize (const MatrixType &data, const size_t num_clusters, matrix<double> &centroids) {
    // Nothing to sample from, or nothing to fill. This also avoids the
    // unsigned underflow in `data.size1 () - 1` below.
    if (num_clusters == 0 || data.size1 () == 0)
        return;

    boost::random::mt19937 gen;
    gen.seed (static_cast<unsigned int> (std::time (0)));

    boost::random::uniform_int_distribution<size_t> uniform_dist (0, data.size1 () - 1);
    const size_t first_centroid_index = uniform_dist (gen);
    row (centroids, 0) = row (data, first_centroid_index);

    boost::container::set<size_t> indices_selected_as_centroid;
    indices_selected_as_centroid.insert (first_centroid_index);

    // Squared distance from every point to its nearest chosen centroid.
    vector<double> closest_centroid_distance (data.size1 ());
    for (size_t i = 0; i < data.size1 (); ++ i)
        closest_centroid_distance (i) = inner_prod (row (data, i) - row (data, first_centroid_index),
                                                    row (data, i) - row (data, first_centroid_index));

    vector<double> weights (data.size1 ());
    for (size_t i = 1; i < num_clusters; ++ i) {
        // Points already used as centroids must never be drawn again, so they
        // get weight 0; every other point is weighted by its squared distance.
        double total_weight = 0;
        for (size_t data_counter = 0; data_counter < data.size1 (); ++ data_counter) {
            if (indices_selected_as_centroid.find (data_counter) != indices_selected_as_centroid.end ())
                weights (data_counter) = 0;
            else
                weights (data_counter) = closest_centroid_distance (data_counter);
            total_weight += weights (data_counter);
        }

        size_t new_centroid_index;
        if (total_weight > 0) {
            boost::random::discrete_distribution<> discrete_dist (weights);
            new_centroid_index = discrete_dist (gen);
        } else {
            // Every remaining point coincides with an existing centroid, so
            // the weights cannot be normalised; fall back to a uniform draw.
            new_centroid_index = uniform_dist (gen);
        }
        row (centroids, i) = row (data, new_centroid_index);
        indices_selected_as_centroid.insert (new_centroid_index);

        // Refresh nearest-centroid distances against the newly added centroid.
        for (size_t data_counter = 0; data_counter < data.size1 (); ++ data_counter) {
            const double new_centroid_distance = inner_prod (row (data, data_counter) - row (data, new_centroid_index),
                                                             row (data, data_counter) - row (data, new_centroid_index));
            if (new_centroid_distance < closest_centroid_distance (data_counter))
                closest_centroid_distance (data_counter) = new_centroid_distance;
        }
    }
}
};
}}}

Expand Down
7 changes: 5 additions & 2 deletions include/boost/numeric/ublas/kmeans/random_initialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

#include <boost/container/set.hpp>

boost::random::mt19937 gen;
#include <ctime>

namespace boost { namespace numeric { namespace ublas {

Expand All @@ -38,8 +38,11 @@ namespace boost { namespace numeric { namespace ublas {
This will lead to empty clusters. Should be handled by EmptyClusterPolicy
or here only? eg set<size_t> selected_indices;
*/
boost::random::mt19937 gen;
gen.seed(static_cast<unsigned int> (std::time (0)));;

for (size_t i = 0; i < num_clusters; ++ i) {
boost::random::uniform_int_distribution<> dist(0, data.size1 () - 1);
boost::random::uniform_int_distribution<> dist (0, data.size1 () - 1);
size_t index = dist (gen);
row (centroids, i) = row (data, index);
// for (size_t j = 0; j < centroids.size2 (); ++j)
Expand Down

0 comments on commit 157e554

Please sign in to comment.