- 用于聚类的数据集
- % matplotlib inline import scipy.io as sio import matplotlib.pyplot as plt '''
- 各种聚类数据
- '''#two_cluster def two_cluster() : two_cluster = u 'cluster_data/two_cluster.mat' two_cluster = sio.loadmat(two_cluster)['X'].T data = two_cluster
- return data#three_cluster def three_cluster() : path = u 'cluster_data/three_cluster.mat' three_cluster = sio.loadmat(path)['X'].T data = three_cluster
- return data#five_cluster def five_cluster() : path = u 'cluster_data/five_cluster.mat' five_cluster = sio.loadmat(path) x = five_cluster['x']#得到的数据为二行n列 y = five_cluster['y']#到的数据为一行n列 data = np.vstack((x, y)).T#先垂直合并,而后转置 #data = np.array([x[0, :], x[1, :], y[0, :]]).T#list与array互换
- return data#spiral def spiral() : path = u 'cluster_data/spiral.mat' spiral = sio.loadmat(path)['spiral'] spiral = spiral[0 : :3, :]#每隔3行取一个数据 data = spiral data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换
- return data#spiral_unbalance def spiral_unbalance() : path = u 'cluster_data/spiral_unbalance.mat' spiral_unbalance = sio.loadmat(path)['spiral_unbalance'] spiral_unbalance = spiral_unbalance[0 : :3, :]#每隔3行取一个数据 data = spiral_unbalance data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换
- return data#ThreeCircles def ThreeCircles() : path = u 'cluster_data/ThreeCircles.mat' ThreeCircles = sio.loadmat(path)['ThreeCircles'] ThreeCircles = ThreeCircles[0 : :3, :]#每隔3行取一个数据 data = ThreeCircles data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换
- return data#Twomoons def Twomoons() : path = u 'cluster_data/Twomoons.mat' Twomoons = sio.loadmat(path)['Twomoons'] Twomoons = Twomoons[0 : :3, :]#每隔3行取一个数据 data = Twomoons data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换 plt.scatter(data[: , 0], data[: , 1], c = data[: , 2])
- return data#Twomoons1 def Twomoons1() : path = u 'cluster_data/Twomoons.mat' Twomoons1 = sio.loadmat(path)['Twomoons'] Twomoons1 = Twomoons1[0 : :3, :]#每隔3行取一个数据 data = Twomoons1 data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换
- return data def test() : print 'test'def show_all() : plt.figure(figsize = (16, 8)) #动态调用方法 func_name_list = ['two_cluster', 'three_cluster', 'five_cluster', 'spiral', 'spiral_unbalance', 'ThreeCircles', 'Twomoons', 'Twomoons1']
- for i in range(8) : data_list.append(eval(func_name_list[i])()) #动态画图
- for i in range(8) : data = data_list[i] plt.subplot(2, 4, i + 1) #plt.figure() plt.scatter(data[: , 0], data[: , 1], c = data[: , 2]) data_list = [] show_all()
- 使用scikit的kmeans进行聚类
- % matplotlib inline import scipy.io as sio#matlab文件名two_cluster = u 'cluster_data/two_cluster.mat'data = sio.loadmat(two_cluster) print data
- % matplotlib inline import matplotlib.pyplot as plt x = data['X'] cValue = x[2] plt.scatter(x[0], x[1], c = cValue)
import numpy as np
from sklearn import cluster, datasets

# Transpose X so that each row is one sample, then keep only the first two
# columns (the plotted coordinates) as the clustering features.
b = np.array(x).T[:, 0:2]
# Fixed random_state keeps the k-means assignment reproducible between runs.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)
cValue = x[2]
# Same points as the raw-data plot, now coloured by the predicted cluster.
plt.scatter(x[0], x[1], c=y_pred)
数据集下载
- % matplotlib inline import scipy.io as sio#matlab文件名two_cluster = u 'cluster_data/spiral.mat'spiral = sio.loadmat(two_cluster)['spiral'] spiral = spiral[0 : :3, :]#每隔3行取一个数据print len(spiral),
- len(spiral[0]) cValue = spiral[: , 0] print cValue.shape color = ['b', 'y'] cValue = [color[int(i)]
- for i in list(cValue)] plt.scatter(spiral[: , 1], spiral[: , 2], c = cValue)
使用 kmeans 的聚类结果
from sklearn import cluster, datasets

# Run 2-cluster k-means on columns 1 and 2 of `spiral` (the plotted
# coordinates), then colour the same points by the predicted cluster.
kmeans = cluster.KMeans(n_clusters=2, random_state=170)
y_pred = kmeans.fit_predict(spiral[:, 1:3])
plt.scatter(x=spiral[:, 1], y=spiral[:, 2], c=y_pred)
使用 scipy 进行层次聚类的效果
- # - *-coding: utf8 - *-%matplotlib inline import scipy.io as sio import matplotlib.pyplot as plt import scipy.cluster.hierarchy as hcluster from sklearn.cluster import AgglomerativeClustering import numpy.random as random import numpy as np import numpy.core.fromnumeric def loadData() : #matlab文件名 two_cluster = u 'cluster_data/spiral.mat' spiral = sio.loadmat(two_cluster)['spiral'] spiral = spiral[0 : :3, :]#每隔3行取一个数据 print len(spiral),
- len(spiral[0]) cValue = spiral[: , 0] print cValue.shape color = ['b', 'y'] cValue = [color[int(i)]
- for i in list(cValue)] plt.scatter(spiral[: , 1], spiral[: , 2], c = cValue) def spiralSample() : plt.subplot(131) plt.title(u 'origal data') plt.scatter(spiral[: , 1], spiral[: , 2], c = spiral[: , 0]) #scipy进行聚类,
- 默认depth = 2(可得到两类),阈值t为距离阈值,设置criterion = 'maxclust',
- 找到两类之间最小距离小于t的进行合并 #http: //docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
- y_pred = hcluster.fclusterdata(spiral[: , 1 : 3], criterion = 'maxclust', t = 2) plt.subplot(132) plt.title(u 'use scipy to hierarchy cluster') plt.scatter(spiral[: , 1], spiral[: , 2], c = y_pred) #scikit进行聚类 plt.subplot(133) plt.title(u 'use scikit to hierarchy cluster') y_pred = AgglomerativeClustering(n_clusters = 2, linkage = 'ward').fit_predict(spiral[: , 1 : 3]) plt.scatter(spiral[: , 1], spiral[: , 2], c = y_pred) plt.show() spiralSample()
来源: http://lib.csdn.net/article/python/44491