Skip to content

Commit

Permalink
Remove compilation bug. Naive Kmeans algorithm working with random in…
Browse files Browse the repository at this point in the history
…itialization. Needs refactoring.
  • Loading branch information
dattatreya303 committed Jun 25, 2018
1 parent 6f3dbec commit 77d9d7f
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 58 deletions.
43 changes: 43 additions & 0 deletions doc/samples/kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/io.hpp>

#include <boost/numeric/ublas/kmeans/kmeans.hpp>
#include <boost/numeric/ublas/kmeans/random_initialization.hpp>
#include <boost/numeric/ublas/kmeans/naive_kmeans.hpp>

#include <string.h>
#include <iostream>

int main() {
using namespace boost::numeric::ublas;
std::string str_data = "[7,2]((1,1),(1.5,2),(3,4),(5,7),(3.5,5),(4.5,5),(3.5,4.5))";
matrix<double> data;
std::istringstream is(str_data);
is >> data;

matrix<double> data2 (150,4);
vector<int> gt (150);
for (int i = 0; i < data2.size1 (); ++i) {
for (int j = 0; j < data2.size2 (); ++j)
std::cin >> data2 (i,j);
std::cin >> gt (i);
}

KMeans<RandomInitialization, NaiveKMeans> kmeans(100);
// std::cout << "init!" << std::endl;
int n = 3;
vector<int> assignments (data2.size1 ());
matrix<double> centroids (n, data2.size2());
// std::cout << "init vars!" << std::endl;
kmeans.Cluster (data2, n, centroids, assignments);
// std::cout << "clusterd!" << std::endl;
for (int i = 0; i < data2.size1 (); i++) {
// std::cout << row (data2, i) << " " << assignments (i) << " " << gt (i) << std::endl;
for (int j = 0; j < data2.size2 (); j++)
std::cout << data2 (i,j) << " ";
std::cout << assignments (i) << std::endl;
// std::cout << gt (i) << std::endl;
}
return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#ifndef _BOOST_UBLAS_REFINED_START_
#define _BOOST_UBLAS_REFINED_START_


#include <boost/numeric/ublas/matrix.hpp>

namespace boost { namespace numeric { namespace ublas {
Expand All @@ -24,8 +23,8 @@ namespace boost { namespace numeric { namespace ublas {
RefinedStart () {}

template <class MatrixType>
static Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
}
}
static void Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
};
}}}

#endif
11 changes: 5 additions & 6 deletions include/boost/numeric/ublas/kmeans/kmeans++.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
// The authors gratefully acknowledge the support of
// GeNeSys mbH & Co. KG in producing this work.

#ifndef _BOOST_UBLAS_REFINED_START_
#define _BOOST_UBLAS_REFINED_START_

#ifndef _BOOST_UBLAS_KMEANSPLUSPLUS_
#define _BOOST_UBLAS_KMEANSPLUSPLUS_

#include <boost/numeric/ublas/matrix.hpp>

Expand All @@ -24,8 +23,8 @@ namespace boost { namespace numeric { namespace ublas {
KMeansPlusPlus () {}

template <class MatrixType>
static Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
}
}
static void Initialize (const MatrixType &data, const size_type num_clusters, const matrix<double> &centroids) {}
};
}}}

#endif
84 changes: 54 additions & 30 deletions include/boost/numeric/ublas/kmeans/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,29 @@
#ifndef _BOOST_UBLAS_KMEANS_
#define _BOOST_UBLAS_KMEANS_

#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>

#include <boost/numeric/ublas/kmeans/random_initialization.hpp>
#include <boost/numeric/ublas/kmeans/naive_kmeans.hpp>

namespace boost { namespace numeric { namespace ublas {

template </*class MetricType = EuclideanDistanceMetric,*/
class InitialPartitionPolicy = RandomInitialization,
class EvaluateStepType = NaiveKMeans,
class MatrixType = matrix<double>>
template<class> class EvaluateStepType = NaiveKMeans>
class KMeans {
public:
KMeans (const size_t max_iterations = 1000,
KMeans (const size_t max_iterations = 10,
/*const MetricType distance_metric = MetricType (),*/
const InitialPartitionPolicy partition_policy = InitialPartitionPolicy (),
const EvaluateStepType evaluation_step = EvaluateStepType ()) :
const InitialPartitionPolicy partition_policy = InitialPartitionPolicy ()) :
max_iterations (max_iterations),
/*distance_metric (distance_metric),*/
partition_policy (partition_policy) {}

template <class MatrixType>
void Cluster (const MatrixType &data,
const size_t num_clusters,
vector<int> &cluster_assignments) {
Expand All @@ -40,26 +45,32 @@ namespace boost { namespace numeric { namespace ublas {
Cluster (data, num_clusters, cluster_centroids, cluster_assignments);
}

template <class MatrixType>
void Cluster (const MatrixType &data,
const size_t num_clusters,
matrix<double> &cluster_centroids) {

GetInitialCentroids (data, num_clusters, centroids);
GetInitialCentroids (data, num_clusters, cluster_centroids);

// std::cout << "got init centroids!" << std::endl;

double norm = 0;
size_t num_iterations = 0;
bool convergence = false, iterations_finished = false;
matrix<double> new_centroids (centroids.size1 (), centroids.size2 ());
EvaluateStepType evaluation_step_type = EvaluateStepType (data/*, distance_metric*/);
matrix<double> new_cluster_centroids (cluster_centroids.size1 (), cluster_centroids.size2 ());
EvaluateStepType<const MatrixType> evaluation_step_type = EvaluateStepType<const MatrixType> (data/*, distance_metric*/);
// std::cout << "evaluate step init!" << std::endl;
do {
/*
We don't have to repeatedly copy new_centroids to centroids
We don't have to repeatedly copy new_cluster_centroids to cluster_centroids
because of this alternating step.
*/
// std::cout << "iterate " << num_iterations << "!" << std::endl;
if (num_iterations % 2)
norm = evaluation_step.Iterate (new_centroids, centroids);
norm = evaluation_step_type.Iterate (new_cluster_centroids, cluster_centroids);
else
norm = evaluation_step.Iterate (centroids, new_centroids);
norm = evaluation_step_type.Iterate (cluster_centroids, new_cluster_centroids);
// std::cout << "iterate " << num_iterations << "done!" << std::endl;
++ num_iterations;

if (num_iterations == max_iterations)
Expand All @@ -69,30 +80,43 @@ namespace boost { namespace numeric { namespace ublas {

} while (!convergence && !iterations_finished);

centroids = new_centroids;
// std::cout << "evaluate iterations done!" << std::endl;

std::cout << new_cluster_centroids.size1() << " " << new_cluster_centroids.size2 () << std::endl;
std::cout << cluster_centroids.size1() << " " << cluster_centroids.size2 () << std::endl;

cluster_centroids = new_cluster_centroids;
// for (size_t i = 0; i < cluster_centroids.size1 (); ++ i)
// for (size_t j = 0; j < cluster_centroids.size2 (); ++ j)
// cluster_centroids (i, j) = new_cluster_centroids (i, j);

// std::cout << "new centroids assigned!" << std::endl;
}

template <class MatrixType>
void Cluster (const MatrixType &data,
const size_t num_clusters,
matrix<double> &cluster_centroids,
vector<int> &cluster_assignments) {

Cluster (data, num_clusters, cluster_centroids);

// std::cout << "1st clusterd!" << std::endl;

for (size_t i = 0; i < data.size1 (); ++ i) {
matrix_row<MatrixType> data_row (data, i);
matrix_row<const MatrixType> data_row (data, i);
size_t assigned_cluster = 0;
/*
Use distance_metric.Apply () here, when implemented.
double min_centroid_distance = distance_metric.Apply (data_row, row (cluster_centroids, 0));
*/
double min_centroid_distance = inner_prod (data_row - row (cluster_centroids, 0));
double min_centroid_distance = inner_prod (data_row - row (cluster_centroids, 0), data_row - row (cluster_centroids, 0));
for (size_t j = 1; j < cluster_centroids.size1 (); ++ j) {
/*
Use distance_metric.Apply () here, when implemented.
double min_centroid_distance = distance_metric.Apply (data_row, row (cluster_centroids, 0));
*/
double distance = inner_prod (data_row - row (cluster_centroids, j));
double distance = inner_prod (data_row - row (cluster_centroids, j), data_row - row (cluster_centroids, j));
if (distance < min_centroid_distance) {
min_centroid_distance = distance;
assigned_cluster = j;
Expand All @@ -102,13 +126,13 @@ namespace boost { namespace numeric { namespace ublas {
}
}

size_t MaxIterations () const {
return max_iterations;
}
// size_t MaxIterations () const {
// return max_iterations;
// }

size_t& MaxIterations () const {
return max_iterations;
}
// size_t& MaxIterations () const {
// return max_iterations;
// }

/*MetricType DistanceMetric () {
return distance_metric;
Expand All @@ -118,21 +142,21 @@ namespace boost { namespace numeric { namespace ublas {
return distance_metric;
}*/

InitialPartitionPolicy PartitionPolicy () {
return partition_policy;
}
// InitialPartitionPolicy PartitionPolicy () {
// return partition_policy;
// }

InitialPartitionPolicy& PartitionPolicy () {
return partition_policy;
}
// InitialPartitionPolicy& PartitionPolicy () {
// return partition_policy;
// }

private:
/*
Modify this method to support partiotion policies which give
initial cluster assignments instead of cluster centroids.
*/
template <MatrixType = matrix<double> >
void GetInitialCentroids (const MatrixType &data, const size_t num_clusters, const matrix<double> &centroids) {
template <class MatrixType>
void GetInitialCentroids (const MatrixType &data, const size_t num_clusters, matrix<double> &centroids) {
partition_policy.Initialize (data, num_clusters, centroids);
}

Expand All @@ -141,6 +165,6 @@ namespace boost { namespace numeric { namespace ublas {
/*MetricType distance_metric;*/
InitialPartitionPolicy partition_policy;
};
}
}}}

#endif
68 changes: 53 additions & 15 deletions include/boost/numeric/ublas/kmeans/naive_kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,94 @@
#ifndef _BOOST_UBLAS_NAIVE_KMEANS_
#define _BOOST_UBLAS_NAIVE_KMEANS_


#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>

#include <boost/math/special_functions/pow.hpp>

#include <cmath>

namespace boost { namespace numeric { namespace ublas {

template </* class MetricType,*/
class MatrixType = matrix<double> >
class NaiveKMeans {
public:
template <class MatrixType/*, class MetricType*/>
NaiveKMeans (MatrixType &data/*, MetricType distance_metric*/) :
/*distance_metric (distance_metric),*/
data (data) {}
NaiveKMeans (MatrixType &data_set/*, MetricType distance_metric*/) :
/*distance_metric (distance_metric),*/
data (data_set) {

// for (size_t i = 0; i < data_set.size1 (); ++ i)
// for (size_t j = 0; j < data_set.size2 (); ++ i)
// data (i, j) = data_set (i, j);
}

double Iterate (matrix<double> &centroids, matrix<double>& new_centroids) {
vector<int> data_points_per_centroid (centroids.size1 ());
vector<int> data_points_per_centroid = zero_vector<int>(centroids.size1 ());
new_centroids *= 0;

vector<int> cluster_assignments (data.size1 ());

// std::cout << "init centroids!" << centroids << std::endl;
// std::cout << "new centroids!" << new_centroids << std::endl;

for (size_t i = 0; i < data.size1 (); ++ i) {
matrix_row<MatrixType> data_row (data, i);
size_t assigned_cluster = 0;
/*
Use distance_metric.Apply () here, when implemented.
double min_centroid_distance = distance_metric.Apply (data_row, row (cluster_centroids, 0));
double min_centroid_distance = distance_metric.Apply (data_row, row (centroids, 0));
*/
double min_centroid_distance = inner_prod (data_row - row (cluster_centroids, 0));
for (size_t j = 1; j < cluster_centroids.size1 (); ++ j) {
double min_centroid_distance = inner_prod (data_row - row (centroids, 0), data_row - row (centroids, 0));
for (size_t j = 0; j < centroids.size1 (); ++ j) {
/*
Use distance_metric.Apply () here, when implemented.
double min_centroid_distance = distance_metric.Apply (data_row, row (cluster_centroids, 0));
double min_centroid_distance = distance_metric.Apply (data_row, row (centroids, 0));
*/
double distance = inner_prod (data_row - row (cluster_centroids, j));
double distance = inner_prod (data_row - row (centroids, j), data_row - row (centroids, j));
// std::cout << "distance from " << row (centroids, j) << " " << j << ": " << distance << std::endl;
if (distance < min_centroid_distance) {
min_centroid_distance = distance;
assigned_cluster = j;
}
}
// std::cout << "assigned cluster to " << i << ", " << data_row << ": " << assigned_cluster << std::endl;
cluster_assignments (i) = assigned_cluster;
new_centroids (assigned_cluster) += data_row;

row (new_centroids, assigned_cluster) += data_row;
// for (size_t i = 0; i < new_centroids.size2 (); ++i)
// new_centroids (assigned_cluster, i) += data_row (i);


data_points_per_centroid (assigned_cluster) += 1;
}

// std::cout << "data points per centroid: " << data_points_per_centroid << std::endl;

for (size_t i = 0; i < new_centroids.size1 (); ++ i)
new_centroids (i) /= data_points_per_centroid (i);
if (data_points_per_centroid (i))
row (new_centroids, i) /= data_points_per_centroid (i);
// for (size_t j = 0; j < new_centroids.size2 (); ++j)
// new_centroids (i, j) /= data_points_per_centroid (i);

/*
Use distance_metric.Apply () here, when implemented.
return distance_metric.Apply (centroids, new_centroids);
*/
double distance_norm = 0;
for (size_t i = 0; i < centroids.size2 (); ++ i) {
distance_norm += std::pow (inner_prod (column (centroids, i) - column (new_centroids, i), column (centroids, i) - column (new_centroids, i)), 2.0);
}
return std::pow (distance_norm, 0.5);

}

private:
MatrixType data;
/*MetricType distance_metric;*/
}
}
};
}}}

#endif
Loading

0 comments on commit 77d9d7f

Please sign in to comment.