Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev committed Jan 26, 2017
2 parents 2617e0e + 72eb2b6 commit 8ef9a8f
Show file tree
Hide file tree
Showing 12 changed files with 548 additions and 128 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ before_script:
- cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0 ..

script:
- make -j2 VERBOSE=1
- make -j2 VERBOSE=1

notifications:
email: false
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ if (NOT DEFINED CUDA_ARCH)
set(CUDA_ARCH "61")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
metric_abstraction.h kmeans.cu knn.cu)
if (NOT DISABLE_PYTHON)
list(APPEND SOURCE_FILES python.cc)
Expand All @@ -29,6 +29,7 @@ if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11")
endif()
cuda_add_library(KMCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS})
target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
if(PYTHONLIBS_FOUND)
include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@ have several days and 12 GB of GPU memory); 300K samples are grouped
into 5000 clusters in 4½ minutes on NVIDIA Titan X (15 iterations); 3M samples
and 1000 clusters take 20 minutes (33 iterations). Yinyang can be
turned off to save GPU memory but the slower Lloyd will be used then.
Three centroid initialization schemes are supported: random, k-means++ and import.
Two distance metrics are supported: L2 (the usual one) and angular (refined cosine).
Four centroid initialization schemes are supported: random, k-means++,
[AFKMC2](http://olivierbachem.ch/files/afkmcmc-oral-pdf.pdf) and import.
Two distance metrics are supported: L2 (the usual one) and angular
(arccos of the scalar product). L1 is in development.
16-bit float support delivers 2x memory compression. If you've got several GPUs,
they can be utilized together and it gives the corresponding linear speedup
either for Lloyd or Yinyang.

The code has been thoroughly tested to yield bit-to-bit identical
results from Yinyang and Lloyd.
results from Yinyang and Lloyd. AFKMC2 was converted from
[the reference code](https://github.com/obachem/kmc2).

Read the articles: [1](http://blog.sourced.tech/post/towards_kmeans_on_gpu/),
[2](https://blog.sourced.tech/post/kmcuda4/).
Expand Down
92 changes: 75 additions & 17 deletions kmcuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
#include <cstdlib>
#include <cstring>
#include <cinttypes>
#include <cfloat>
#include <cmath>
#include <cassert>
#include <algorithm>
#include <map>
#include <memory>

#include <cuda_runtime_api.h>
Expand Down Expand Up @@ -189,11 +189,11 @@ static KMCUDAResult print_memory_stats(const std::vector<int> &devs) {
extern "C" {

KMCUDAResult kmeans_init_centroids(
KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed,
const std::vector<int> &devs, int device_ptrs, int fp16x2, int32_t verbosity,
const float *host_centroids, const udevptrs<float> &samples,
udevptrs<float> *dists, udevptrs<float> *dev_sums, udevptrs<float> *centroids) {
KMCUDAInitMethod method, const void *init_params, uint32_t samples_size,
uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric,
uint32_t seed, const std::vector<int> &devs, int device_ptrs, int fp16x2,
int32_t verbosity, const float *host_centroids, const udevptrs<float> &samples,
udevptrs<float> *dists, udevptrs<float> *aux, udevptrs<float> *centroids) {
srand(seed);
switch (method) {
case kmcudaInitMethodImport:
Expand Down Expand Up @@ -237,7 +237,7 @@ KMCUDAResult kmeans_init_centroids(
);
break;
}
case kmcudaInitMethodPlusPlus:
case kmcudaInitMethodPlusPlus: {
INFO("performing kmeans++...\n");
float smoke = NAN;
uint32_t first_offset;
Expand All @@ -253,9 +253,9 @@ KMCUDAResult kmeans_init_centroids(
printf("kmeans++: dump %" PRIu32 " %" PRIu32 " %p\n",
samples_size, features_size, host_dists.get());
FOR_EACH_DEVI(
printf("kmeans++: dev #%d: %p %p %p %p\n", devs[devi],
printf("kmeans++: dev #%d: %p %p %p\n", devs[devi],
samples[devi].get(), (*centroids)[devi].get(),
(*dists)[devi].get(), (*dev_sums)[devi].get());
(*dists)[devi].get());
);
}
for (uint32_t i = 1; i < clusters_size; i++) {
Expand All @@ -267,7 +267,7 @@ KMCUDAResult kmeans_init_centroids(
float dist_sum = 0;
RETERR(kmeans_cuda_plus_plus(
samples_size, features_size, i, metric, devs, fp16x2, verbosity,
samples, centroids, dists, dev_sums, host_dists.get(), &dist_sum),
samples, centroids, dists, host_dists.get(), &dist_sum),
DEBUG("\nkmeans_cuda_plus_plus failed\n"));
if (dist_sum != dist_sum) {
assert(dist_sum == dist_sum);
Expand Down Expand Up @@ -307,21 +307,79 @@ KMCUDAResult kmeans_init_centroids(
(j - 1) * features_size, features_size);
}
break;
}
case kmcudaInitMethodAFKMC2: {
uint32_t m = *reinterpret_cast<const uint32_t*>(init_params);
if (m == 0) {
m = 200;
} else if (m > samples_size / 2) {
INFO("afkmc2: m > %" PRIu32 " is not supported (got %" PRIu32 ")\n",
samples_size / 2, m);
return kmcudaInvalidArguments;
}
float smoke = NAN;
uint32_t first_offset;
while (smoke != smoke) {
first_offset = (rand() % samples_size) * features_size;
cudaSetDevice(devs[0]);
CUCH(cudaMemcpy(&smoke, samples[0].get() + first_offset, sizeof(float),
cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
}
INFO("afkmc2: calculating q (c0 = %" PRIu32 ")... ",
first_offset / features_size);
CUMEMCPY_D2D_ASYNC(*centroids, 0, samples, first_offset, features_size);
auto q = std::unique_ptr<float[]>(new float[samples_size]);
kmeans_cuda_afkmc2_calc_q(
samples_size, features_size, first_offset / features_size, metric,
devs, fp16x2, verbosity, samples, dists, q.get());
INFO("done\n");
auto cand_ind = std::unique_ptr<uint32_t[]>(new uint32_t[m]);
auto rand_a = std::unique_ptr<float[]>(new float[m]);
auto p_cand = std::unique_ptr<float[]>(new float[m]);
for (uint32_t k = 1; k < clusters_size; k++) {
if (verbosity > 1 || (verbosity > 0 && (
clusters_size < 100 || k % (clusters_size / 100) == 0))) {
printf("\rstep %d", k);
fflush(stdout);
}
RETERR(kmeans_cuda_afkmc2_random_step(
k, m, seed, verbosity, dists->back().get(),
reinterpret_cast<uint32_t*>(aux->back().get()),
cand_ind.get(), aux->back().get() + m, rand_a.get()));
RETERR(kmeans_cuda_afkmc2_min_dist(
k, m, metric, fp16x2, verbosity, samples.back().get(),
reinterpret_cast<uint32_t*>(aux->back().get()),
centroids->back().get(), aux->back().get() + m, p_cand.get()));
float curr_prob = 0;
uint32_t curr_ind = 0;
for (uint32_t j = 0; j < m; j++) {
auto cand_prob = p_cand[j] / q[cand_ind[j]];
if (curr_prob == 0 || cand_prob / curr_prob > rand_a[j]) {
curr_ind = j;
curr_prob = cand_prob;
}
}
CUMEMCPY_D2D_ASYNC(*centroids, k * features_size,
samples, cand_ind[curr_ind] * features_size,
features_size);
}
break;
}
}
INFO("\rdone \n");
return kmcudaSuccess;
}

KMCUDAResult kmeans_cuda(
KMCUDAInitMethod init, float tolerance, float yinyang_t,
KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
uint32_t *assignments, float *average_distance) {
DEBUG("arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, tolerance,
yinyang_t, metric, samples_size, features_size, clusters_size, seed,
device, fp16x2, verbosity, samples, centroids, assignments,
DEBUG("arguments: %d %p %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, init_params,
tolerance, yinyang_t, metric, samples_size, features_size, clusters_size,
seed, device, fp16x2, verbosity, samples, centroids, assignments,
average_distance);
RETERR(check_kmeans_args(
tolerance, yinyang_t, samples_size, features_size, clusters_size,
Expand Down Expand Up @@ -392,8 +450,8 @@ KMCUDAResult kmeans_cuda(
FOR_EACH_DEV(cudaProfilerStart());
#endif
RETERR(kmeans_init_centroids(
init, samples_size, features_size, clusters_size, metric, seed, devs,
device_ptrs, fp16x2, verbosity, centroids, device_samples,
init, init_params, samples_size, features_size, clusters_size, metric,
seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples,
reinterpret_cast<udevptrs<float>*>(&device_assignments),
reinterpret_cast<udevptrs<float>*>(&device_assignments_prev),
&device_centroids),
Expand Down
7 changes: 6 additions & 1 deletion kmcuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ enum KMCUDAResult {
enum KMCUDAInitMethod {
kmcudaInitMethodRandom = 0,
kmcudaInitMethodPlusPlus,
kmcudaInitMethodAFKMC2,
kmcudaInitMethodImport
};

Expand All @@ -27,6 +28,10 @@ extern "C" {

/// @brief Performs K-means clustering on GPU / CUDA.
/// @param init centroids initialization method.
/// @param init_params pointer to a struct / number with centroid initialization
/// parameters. Ignored unless init == kmcudaInitMethodAFKMC2.
/// In case with kmcudaInitMethodAFKMC2 it is expected to be
/// uint32_t* to m; m == 0 means the default value (200).
/// @param tolerance if the number of reassignments drop below this ratio, stop.
/// @param yinyang_t the relative number of cluster groups, usually 0.1.
/// @param metric the distance metric to use. The default is Euclidean (L2), can be
Expand All @@ -53,7 +58,7 @@ extern "C" {
/// the corresponding centroids. If nullptr, not calculated.
/// @return KMCUDAResult.
KMCUDAResult kmeans_cuda(
KMCUDAInitMethod init, float tolerance, float yinyang_t,
KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
Expand Down
Loading

0 comments on commit 8ef9a8f

Please sign in to comment.