December 13, 2015 6:45 PM
聚类分析是一种静态数据分析方法,常被用于机器学习,模式识别,数据挖掘等领域。通常认为,聚类是一种无监督式的机器学习方法,它的过程是这样的:在未知样本类别的情况下,通过计算样本彼此间的距离(欧氏距离, 马氏距离,汉明距离,余弦距离等)来估计样本所属类别。从结构性来划分,聚类方法分为自上而下和自下而上两种方法,前者的算法是先把所有样本视为一类,然后不断从这个大类中分离出小类,直到不能再分为止;后者则相反,首先所有样本自成一类,然后不断两两合并,直到最终形成几个大类。
Kmeans 聚类是一种基于划分(partition-based)的聚类方法,它的优点是简单、速度快;缺点是聚类结果与初始中心的选择有关系,且必须提供聚类的数目。Kmeans 的第二个缺点是致命的,因为在有些时候,我们不知道样本集将要聚成多少个类别,这种时候 kmeans 是不适合的,推荐使用 hierarchical 或 meanshift 来聚类。第一个缺点可以通过多次聚类取最佳结果来解决。
Kmeans 的计算过程大致如下:1) 随机选择 k 个聚类中心(最终的类别个数 ≤ k);2) 计算每个样本到各个中心的距离;3) 将每个样本归入离它最近的中心所在的类;4) 重新计算每个类的中心;5) 重复步骤 2–4,直到满足收敛要求(通常是中心点不再改变,或达到一定迭代次数)。
以下为分别用 python 和 matlab 实现的代码,供大家学习和讨论。
python 版本:
# Kmeans.py  2015.11.08  Email: wyxidian@gmail.com
#
# K-means clustering demo: k-means++ seeding, Lloyd iterations, and an
# "elbow" heuristic on the per-cluster variance curve that picks the
# number of clusters automatically.
#
# Fixes vs. the original paste: broken `* *2` tokens, float values used
# as array sizes/indices, `randint` called with a float upper bound,
# the center update indexing `Sum_x[I[i]] / Num[I[i]]` (which mixed in
# the sort order of the *last* sample instead of the cluster id), the
# seeding that measured distance only to the previously chosen center,
# and division by zero on empty clusters.
import numpy as np


def _seed_centers(x, y, k, rng):
    """Pick k initial centers with k-means++ seeding.

    The first center is a data point chosen uniformly at random; each
    subsequent center is a data point sampled with probability
    proportional to its squared distance from the *nearest* center
    already chosen (the standard k-means++ rule).
    """
    n = len(x)
    cx = np.empty(k)
    cy = np.empty(k)
    first = rng.randint(n)
    cx[0], cy[0] = x[first], y[first]
    for i in range(1, k):
        # Squared distance of every sample to its nearest chosen center.
        d2 = np.min((x[:, None] - cx[None, :i]) ** 2
                    + (y[:, None] - cy[None, :i]) ** 2, axis=1)
        total = d2.sum()
        if total == 0.0:  # degenerate: every sample coincides with a center
            idx = rng.randint(n)
        else:
            idx = rng.choice(n, p=d2 / total)
        cx[i], cy[i] = x[idx], y[idx]
    return cx, cy


def kmeans(x, y, k, tol=1e-3, rng=None):
    """Cluster 2-D points (x, y) into k groups with Lloyd's algorithm.

    Parameters
    ----------
    x, y : 1-D array-likes of equal length (point coordinates).
    k    : number of clusters.
    tol  : stop once the summed squared center movement drops below tol.
    rng  : optional numpy RandomState for reproducible seeding.

    Returns
    -------
    labels : int array; labels[i] is the cluster index of sample i.
    cx, cy : arrays holding the k final cluster centers.
    """
    if rng is None:
        rng = np.random.RandomState()
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    cx, cy = _seed_centers(x, y, k, rng)
    while True:
        # Assignment step: nearest center for every sample.
        d2 = (x[:, None] - cx) ** 2 + (y[:, None] - cy) ** 2
        labels = d2.argmin(axis=1)
        # Update step: each center moves to the mean of its members.
        new_cx, new_cy = cx.copy(), cy.copy()
        for j in range(k):
            members = labels == j
            if members.any():  # leave an empty cluster's center in place
                new_cx[j] = x[members].mean()
                new_cy[j] = y[members].mean()
        shift = ((new_cx - cx) ** 2).sum() + ((new_cy - cy) ** 2).sum()
        cx, cy = new_cx, new_cy
        if shift <= tol:
            return labels, cx, cy


def mean_variance(x, y, labels, k):
    """Average per-cluster variance: mean over clusters of var(x)+var(y)."""
    total = 0.0
    for j in range(k):
        members = labels == j
        if members.any():
            total += np.var(x[members]) + np.var(y[members])
    return total / k


def pick_k(x, y, k_range, rng=None):
    """Elbow heuristic: return (best_k, variances) over candidate k values.

    For each interior candidate, the ratio of the variance drop before k
    to the drop after k is computed; the largest ratio marks the elbow
    (endpoints keep the neutral ratio 1, as in the original).
    """
    k_range = list(k_range)
    var_result = np.zeros(len(k_range))
    for pos, k in enumerate(k_range):
        labels, _, _ = kmeans(x, y, k, rng=rng)
        var_result[pos] = mean_variance(x, y, labels, k)
    rate = np.ones(len(k_range))
    for pos in range(1, len(k_range) - 1):
        denom = var_result[pos + 1] - var_result[pos]
        if denom != 0.0:
            rate[pos] = (var_result[pos] - var_result[pos - 1]) / denom
    return k_range[int(np.argmax(rate))], var_result


def main():
    # Matplotlib is only needed for the demo plots, so import it lazily;
    # the module itself stays importable without a display backend.
    from matplotlib import pyplot as plt, rcParams
    # Display Chinese glyphs and the minus sign correctly.
    rcParams['font.sans-serif'] = ['SimHei']
    rcParams['axes.unicode_minus'] = False

    k_range = range(2, 7)  # candidate cluster counts; final k <= max(k_range)
    length = 500
    rng = np.random.RandomState()
    # Generate three gaussian blobs.
    x = np.empty(length)
    y = np.empty(length)
    x[0:100] = rng.randn(100) + 1
    y[0:100] = rng.randn(100) + 1
    x[100:200] = rng.randn(100) + 6
    y[100:200] = rng.randn(100) - 2
    x[200:500] = rng.randn(300) + 5
    y[200:500] = rng.randn(300) + 10

    plt.figure()
    plt.plot(x, y, 'o')
    plt.title('Original Data')

    # Try every candidate k and locate the elbow of the variance curve.
    best_k, var_result = pick_k(x, y, k_range, rng=rng)
    print(var_result)
    plt.figure()
    plt.plot(list(k_range), var_result, 'r-')

    # Final clustering with the chosen k.
    labels, cx, cy = kmeans(x, y, best_k, rng=rng)
    print("Final center of each class")
    print("x:", cx)
    print("y:", cy)
    plt.figure()
    colors = 'rgcmybk'
    for j in range(best_k):
        members = labels == j
        plt.plot(x[members], y[members], colors[j % len(colors)] + 'o')
    plt.plot(cx, cy, 'ks')
    plt.title('Result')
    plt.show()


if __name__ == '__main__':
    main()
matlab 版本:
% K-means clustering demo (k = 3): k-means++ seeding + Lloyd iterations.
%
% Fixes vs. the original paste: the center update indexed by the sort
% order of the last processed sample (Sum_x(I(i)) / Num(I(i))) instead of
% the cluster id; the seeding measured distance only to the previously
% chosen seed and started from a uniform random point instead of a data
% point; several arrays were hard-coded to size 3 instead of k; empty
% clusters caused division by zero.
clear all;
close all;
clc;

k = 3;

% Generate three gaussian blobs.
x(1 : 100) = randn(1, 100) + 1;
y(1 : 100) = randn(1, 100) + 1;
x(101 : 200) = randn(1, 100) + 6;
y(101 : 200) = randn(1, 100) - 2;
x(201 : 500) = randn(1, 300) + 5;
y(201 : 500) = randn(1, 300) + 10;
Length = length(x);

figure();
plot(x, y, 'o')
title('原始数据')

% ---- k-means++ seeding -------------------------------------------------
% First seed: a data point chosen uniformly at random. Each further seed:
% a data point sampled with probability proportional to its squared
% distance to the NEAREST seed chosen so far.
seed = zeros(k, 2);
idx0 = randi(Length);
seed(1, 1) = x(idx0);
seed(1, 2) = y(idx0);
for i = 1 : (k - 1)
    dist2 = inf(1, Length);
    for s = 1 : i
        dist2 = min(dist2, (x - seed(s, 1)).^2 + (y - seed(s, 2)).^2);
    end
    random = sum(dist2) * rand;
    for j = 1 : Length
        random = random - dist2(j);
        if random <= 0
            seed(i + 1, 1) = x(j);
            seed(i + 1, 2) = y(j);
            break;
        end
    end
end
Center_x = (seed(:, 1))';
Center_y = (seed(:, 2))';
Last_Center_x = zeros(1, k);   % was hard-coded zeros(1, 3)
Last_Center_y = zeros(1, k);
index = zeros(1, Length);

% ---- Lloyd iterations: stop when the centers are (almost) stable -------
while (sum((Center_x - Last_Center_x).^2) + sum((Center_y - Last_Center_y).^2)) > 1e-3
    Sum_x = zeros(1, k);
    Sum_y = zeros(1, k);
    Num = zeros(1, k);
    for i = 1 : Length
        dist2 = (Center_x - x(i)).^2 + (Center_y - y(i)).^2;
        [minval, I] = min(dist2);      % nearest center (was sort + I(1))
        index(i) = I;
        Sum_x(I) = Sum_x(I) + x(i);
        Sum_y(I) = Sum_y(I) + y(i);
        Num(I) = Num(I) + 1;
    end
    Last_Center_x = Center_x;
    Last_Center_y = Center_y;
    for i = 1 : k
        if Num(i) > 0                  % leave empty clusters in place
            % BUG FIX: original divided Sum_x(I(i)) / Num(I(i)), indexing
            % by the last sample's sort order instead of the cluster id.
            Center_x(i) = Sum_x(i) / Num(i);
            Center_y(i) = Sum_y(i) / Num(i);
        end
    end
end

% ---- Plot the classification result ------------------------------------
figure();
result1 = zeros(2, Num(1));
result2 = zeros(2, Num(2));
result3 = zeros(2, Num(3));
Num = zeros(1, k);
for i = 1 : Length
    switch index(i)
        case 1
            Num(1) = Num(1) + 1;
            result1(:, Num(1)) = [x(i); y(i)];
        case 2
            Num(2) = Num(2) + 1;
            result2(:, Num(2)) = [x(i); y(i)];
        case 3
            Num(3) = Num(3) + 1;
            result3(:, Num(3)) = [x(i); y(i)];
    end
end
plot(result1(1, :), result1(2, :), 'ro', ...
     result2(1, :), result2(2, :), 'go', ...
     result3(1, :), result3(2, :), 'co');
title('分类结果')
hold on;
plot(Center_x, Center_y, 'ks');
结果展示:
第一幅图为待分类数据,需要注意的是,我们事先并不知道需要分成几个类;
第二幅图为选取不同分类个数,得出最终结果的方差,我们可以看到,在最理想的分类个数为 3 时,曲线出现拐点,因此,我们只要找到这个拐点,进行分类即可;
第三幅图是分类结果
来源: http://lib.csdn.net/article/machinelearning/37049