The code in this article is written in ISO C90.
/*****
 ** kmeans.c
 ** - a simple k-means clustering routine
 ** - returns the cluster labels of the data points in an array
 ** - here's an example
 **   extern int *k_means(double**, int, int, int, double, double**);
 **   ...
 **   int *c = k_means(data_points, num_points, dim, 20, 1e-4, 0);
 **   for (i = 0; i < num_points; i++) {
 **      printf("data point %d is in cluster %d\n", i, c[i]);
 **   }
 **   ...
 **   free(c);
 ** Parameters
 ** - array of data points (double **data)
 ** - number of data points (int n)
 ** - dimension (int m)
 ** - desired number of clusters (int k)
 ** - error tolerance (double t)
 **   - used as the stopping criterion: when the sum of squared
 **     Euclidean distances (the standard error for k-means) of
 **     an iteration is within t of that of the previous
 **     iteration, the clusters are considered "stable" and the
 **     function returns
 **   - a suggested value would be 0.0001
 ** - output address for the final centroids (double **centroids)
 **   - user must make sure the memory is properly allocated, or
 **     pass the null pointer if not interested in the centroids
 ** References
 ** - J. MacQueen, "Some methods for classification and analysis
 **   of multivariate observations", Fifth Berkeley Symposium on
 **   Math Statistics and Probability, 281-297, 1967.
 ** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm
 **   on distributed memory multiprocessors",
 **   Large-Scale Parallel Data Mining, 245-260, 1999.
 ** Notes
 ** - this function is provided as is with no warranty.
 ** - the author is not responsible for any damage caused
 **   either directly or indirectly by using this function.
 ** - anybody is free to do whatever he/she wants with this
 **   function as long as this header section is preserved.
 ** Created on 2005-04-12 by
 ** - Roger Zhang (rogerz@cs.dal.ca)
 ** Modifications
 ** -
 ** Last compiled under Linux with gcc-3
 */
 /*
 ** src: http://cs.smu.ca/~r_zhang/code/kmeans.c
 */

#include <stdlib.h>
#include <assert.h>
#include <float.h>
#include <math.h>

int *k_means(double **data, int n, int m, int k, double t, double **centroids) {
    /* output cluster label for each data point */
    int *labels = (int *)calloc(n, sizeof(int));

    int h, i, j; /* loop counters, of course :) */
    int *counts = (int *)calloc(k, sizeof(int)); /* size of each cluster */
    double old_error, error = DBL_MAX; /* sum of squared euclidean distance */
    double **c = centroids ? centroids : (double **)calloc(k, sizeof(double *));
    double **c1 = (double **)calloc(k, sizeof(double *)); /* temp centroids */

    assert(data && k > 0 && k <= n && m > 0 && t >= 0); /* for debugging */

    /****
     ** initialization */

    for (h = i = 0; i < k; h += n / k, i++) {
        c1[i] = (double *)calloc(m, sizeof(double));
        if (!centroids) {
            c[i] = (double *)calloc(m, sizeof(double));
        }
        /* pick k points as initial centroids */
        for (j = m; j-- > 0; c[i][j] = data[h][j]);
    }
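    /* note: seeding with every (n/k)-th point is deterministic and cheap,
       but it ties the initial centroids to the input ordering; shuffle the
       data beforehand if that matters */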

    /****
     ** main loop */

    do {
        /* save error from last step */
        old_error = error, error = 0;

        /* clear old counts and temp centroids */
        for (i = 0; i < k; counts[i++] = 0) {
            for (j = 0; j < m; c1[i][j++] = 0);
        }

        for (h = 0; h < n; h++) {
            /* identify the closest cluster */
            double min_distance = DBL_MAX;
            for (i = 0; i < k; i++) {
                double distance = 0;
                for (j = m; j-- > 0; distance += pow(data[h][j] - c[i][j], 2));
                if (distance < min_distance) {
                    labels[h] = i;
                    min_distance = distance;
                }
            }
            /* update size and temp centroid of the destination cluster */
            for (j = m; j-- > 0; c1[labels[h]][j] += data[h][j]);
            counts[labels[h]]++;
            /* update standard error */
            error += min_distance;
        }

        for (i = 0; i < k; i++) { /* update all centroids */
            for (j = 0; j < m; j++) {
                c[i][j] = counts[i] ? c1[i][j] / counts[i] : c1[i][j];
            }
        }
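        /* note: if a cluster ends up empty (counts[i] == 0), c1[i][j] is
           still 0 from the clearing step above, so its centroid collapses
           to the origin */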

    } while (fabs(error - old_error) > t);

    /****
     ** housekeeping */

    for (i = 0; i < k; i++) {
        if (!centroids) {
            free(c[i]);
        }
        free(c1[i]);
    }

    if (!centroids) {
        free(c);
    }
    free(c1);

    free(counts);

    return labels;
}
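
For completeness, here is a minimal driver sketch showing one way to call the routine, including the caller-allocated centroids output the header describes. The data values, the choice of k = 2, and the main.c/kmeans.c file split are illustrative assumptions, not part of the original code.

/* main.c - illustrative driver for kmeans.c (not part of the original file)
** compile (assumed): gcc -ansi -Wall main.c kmeans.c -o kmeans_demo -lm */
#include <stdio.h>
#include <stdlib.h>

extern int *k_means(double **data, int n, int m, int k, double t,
                    double **centroids);

int main(void) {
    /* six 2-d points: three near the origin, three near (10, 10) */
    double raw[6][2] = {
        {0.0, 0.1}, {0.2, 0.0}, {0.1, 0.2},
        {10.0, 10.1}, {10.2, 9.9}, {9.9, 10.0}
    };
    int n = 6, m = 2, k = 2, i, j;
    double **data = (double **)malloc(n * sizeof(double *));
    double **centroids = (double **)malloc(k * sizeof(double *));
    int *labels;

    for (i = 0; i < n; i++) {
        data[i] = raw[i]; /* each row of the static array decays to double * */
    }
    for (i = 0; i < k; i++) {
        /* caller-owned centroid storage, as the header requires */
        centroids[i] = (double *)malloc(m * sizeof(double));
    }

    labels = k_means(data, n, m, k, 1e-4, centroids);

    for (i = 0; i < n; i++) {
        printf("data point %d is in cluster %d\n", i, labels[i]);
    }
    for (i = 0; i < k; i++) {
        printf("centroid %d:", i);
        for (j = 0; j < m; j++) {
            printf(" %g", centroids[i][j]);
        }
        printf("\n");
    }

    for (i = 0; i < k; i++) {
        free(centroids[i]);
    }
    free(centroids);
    free(data);
    free(labels);
    return 0;
}

Passing a null pointer as the last argument instead would skip the centroid output and let the routine manage that memory internally, as in the header's own example.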