-
Notifications
You must be signed in to change notification settings - Fork 12
/
kmeans.h
125 lines (101 loc) · 3.81 KB
/
kmeans.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*-------------------------------------------------------------------------
*
* kmeans.h
* Generic k-means implementation
*
* Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
*
*------------------------------------------------------------------------*/
#include <stdlib.h>
/*
* Simple k-means implementation for arbitrary data structures
*
* Since k-means partitions based on inter-object "distance" the same
* machinery can be used to support any object type that can calculate a
* "distance" between pairs.
*
* To use the k-means infrastructure, just fill out the kmeans_config
* structure and invoke the kmeans() function.
*/
/*
* Threaded calculation is available using pthreads, which practically
* means UNIX platforms only, unless you're building with a posix
* compatible environment.
*
* #define KMEANS_THREADED
*/
/*
* When clustering lists with NULL elements, they will get this as
* their cluster number. (All the other clusters will be non-negative)
*/
#define KMEANS_NULL_CLUSTER -1
/*
* If the algorithm doesn't converge within this number of iterations,
* it will return with a failure error code.
*/
#define KMEANS_MAX_ITERATIONS 1000
/*
* The code doesn't try to figure out how many threads to use, so
* best to set this to the number of cores you expect to have
* available. The threshold is the the value of k*n at which to
* move to multi-threading.
*/
#ifdef KMEANS_THREADED
#define KMEANS_THR_MAX 4
#define KMEANS_THR_THRESHOLD 250000
#endif
#define kmeans_malloc(size) malloc(size)
#define kmeans_free(ptr) free(ptr)
typedef void * Pointer;
typedef enum {
KMEANS_OK,
KMEANS_EXCEEDED_MAX_ITERATIONS,
KMEANS_ERROR
} kmeans_result;
/*
* Prototype for the distance calculating function
*/
typedef double (*kmeans_distance_method) (const Pointer a, const Pointer b);
/*
* Prototype for the centroid calculating function
* @param objs the list of all objects in the calculation
* @param clusters the list of cluster numbers for each object
* @param num_objs the number of objects/cluster numbers in the previous arrays
* @param cluster the cluster number we are actually generating a centroid for here
* @param centroid the object to write the centroid result into (already allocated)
*/
typedef void (*kmeans_centroid_method) (const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid);
typedef struct kmeans_config
{
/* Function returns the "distance" between any pair of objects */
kmeans_distance_method distance_method;
/* Function returns the "centroid" of a collection of objects */
kmeans_centroid_method centroid_method;
/* An array of objects to be analyzed. User allocates this array */
/* and is responsible for freeing it. */
/* For objects that are not capable of participating in the distance */
/* calculations, but for which you still want included in the process */
/* (for examples, database nulls, or geometry empties) use a NULL */
/* value in this list. All NULL values will be returned in the */
/* KMEANS_NULL_CLUSTER. */
Pointer * objs;
/* Number of objects in the preceding array */
size_t num_objs;
/* An array of inital centers for the algorithm */
/* Can be randomly assigned, or using proportions, */
/* unfortunately the algorithm is sensitive to starting */
/* points, so using a "better" set of starting points */
/* might be wise. User allocates and is responsible for freeing. */
Pointer * centers;
/* Number of means we are calculating, length of preceding array */
unsigned int k;
/* Maximum number of times to iterate the algorithm, or 0 for */
/* library default */
unsigned int max_iterations;
/* Iteration counter */
unsigned int total_iterations;
/* Array to fill in with cluster numbers. User allocates and frees. */
int * clusters;
} kmeans_config;
/* This is where the magic happens. */
kmeans_result kmeans(kmeans_config *config);