使用 Python 进行聚类（scikit-learn、scipy）

用于聚类的数据集

% matplotlib inline import scipy.io as sio import matplotlib.pyplot as plt '''
各种聚类数据
'''#two_cluster def two_cluster() :   two_cluster = u 'cluster_data/two_cluster.mat'  two_cluster = sio.loadmat(two_cluster)['X'].T  data = two_cluster  
return data#three_cluster def three_cluster() :   path = u 'cluster_data/three_cluster.mat'  three_cluster = sio.loadmat(path)['X'].T  data = three_cluster    
return data#five_cluster def five_cluster() :   path = u 'cluster_data/five_cluster.mat'  five_cluster = sio.loadmat(path)  x = five_cluster['x']#得到的数据为二行n列  y = five_cluster['y']#到的数据为一行n列  data = np.vstack((x, y)).T#先垂直合并，而后转置  #data = np.array([x[0, :], x[1, :], y[0, :]]).T#list与array互换  
return data#spiral def spiral() :   path = u 'cluster_data/spiral.mat'  spiral = sio.loadmat(path)['spiral']  spiral = spiral[0 : :3, :]#每隔3行取一个数据  data = spiral  data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换  
return data#spiral_unbalance def spiral_unbalance() :   path = u 'cluster_data/spiral_unbalance.mat'  spiral_unbalance = sio.loadmat(path)['spiral_unbalance']  spiral_unbalance = spiral_unbalance[0 : :3, :]#每隔3行取一个数据  data = spiral_unbalance  data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换  
return data#ThreeCircles def ThreeCircles() :   path = u 'cluster_data/ThreeCircles.mat'  ThreeCircles = sio.loadmat(path)['ThreeCircles']  ThreeCircles = ThreeCircles[0 : :3, :]#每隔3行取一个数据  data = ThreeCircles  data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换  
return data#Twomoons def Twomoons() :   path = u 'cluster_data/Twomoons.mat'  Twomoons = sio.loadmat(path)['Twomoons']  Twomoons = Twomoons[0 : :3, :]#每隔3行取一个数据  data = Twomoons  data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换  plt.scatter(data[: , 0], data[: , 1], c = data[: , 2])  
return data#Twomoons1 def Twomoons1() :   path = u 'cluster_data/Twomoons.mat'  Twomoons1 = sio.loadmat(path)['Twomoons']  Twomoons1 = Twomoons1[0 : :3, :]#每隔3行取一个数据  data = Twomoons1  data = np.array([data[: , 1], data[: , 2], data[: , 0]]).T#list与array互换  
return data def test() :   print 'test'def show_all() :   plt.figure(figsize = (16, 8))  #动态调用方法  func_name_list = ['two_cluster', 'three_cluster', 'five_cluster', 'spiral', 'spiral_unbalance', 'ThreeCircles', 'Twomoons', 'Twomoons1']  
for i in range(8) :     data_list.append(eval(func_name_list[i])())  #动态画图  
for i in range(8) :     data = data_list[i]    plt.subplot(2, 4, i + 1)    #plt.figure()    plt.scatter(data[: , 0], data[: , 1], c = data[: , 2])   data_list = [] show_all()

使用scikit的kmeans进行聚类

% matplotlib inline import scipy.io as sio#matlab文件名two_cluster = u 'cluster_data/two_cluster.mat'data = sio.loadmat(two_cluster) print data

% matplotlib inline import matplotlib.pyplot as plt x = data['X'] cValue = x[2] plt.scatter(x[0], x[1], c = cValue)

import numpy as np
from sklearn import cluster, datasets

# `x` comes from the previous cell; transpose so that each sample is one row,
# then keep only the first two columns (the coordinates) as features.
b = np.array(x).T
b = b[:, 0:2]
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(b)
cValue = x[2]
# Colour the points by the k-means assignment instead of by row 2 of `x`.
plt.scatter(x[0], x[1], c=y_pred)

数据集下载

% matplotlib inline import scipy.io as sio#matlab文件名two_cluster = u 'cluster_data/spiral.mat'spiral = sio.loadmat(two_cluster)['spiral'] spiral = spiral[0 : :3, :]#每隔3行取一个数据print len(spiral),
len(spiral[0]) cValue = spiral[: , 0] print cValue.shape color = ['b', 'y'] cValue = [color[int(i)]
for i in list(cValue)] plt.scatter(spiral[: , 1], spiral[: , 2], c = cValue)

使用 kmeans 结果

from sklearn import cluster, datasets

# Cluster on columns 1 and 2 of `spiral` (loaded in the previous cell) and
# colour the scatter plot by the k-means assignment.
y_pred = cluster.KMeans(n_clusters=2, random_state=170).fit_predict(spiral[:, 1:3])
plt.scatter(spiral[:, 1], spiral[:, 2], c=y_pred)

使用 scipy 进行聚类的效果

# - *-coding: utf8 - *-%matplotlib inline import scipy.io as sio import matplotlib.pyplot as plt import scipy.cluster.hierarchy as hcluster from sklearn.cluster import AgglomerativeClustering import numpy.random as random import numpy as np import numpy.core.fromnumeric def loadData() :   #matlab文件名   two_cluster = u 'cluster_data/spiral.mat'  spiral = sio.loadmat(two_cluster)['spiral']  spiral = spiral[0 : :3, :]#每隔3行取一个数据  print len(spiral),
len(spiral[0])  cValue = spiral[: , 0]  print cValue.shape  color = ['b', 'y']  cValue = [color[int(i)]
for i in list(cValue)]  plt.scatter(spiral[: , 1], spiral[: , 2], c = cValue) def spiralSample() :   plt.subplot(131)  plt.title(u 'origal data')  plt.scatter(spiral[: , 1], spiral[: , 2], c = spiral[: , 0])  #scipy进行聚类,
默认depth = 2（可得到两类），阈值t为距离阈值，设置criterion = 'maxclust',
找到两类之间最小距离小于t的进行合并  #http: //docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster
  y_pred = hcluster.fclusterdata(spiral[: , 1 : 3], criterion = 'maxclust', t = 2)    plt.subplot(132)  plt.title(u 'use scipy to hierarchy cluster')  plt.scatter(spiral[: , 1], spiral[: , 2], c = y_pred)  #scikit进行聚类  plt.subplot(133)  plt.title(u 'use scikit to hierarchy cluster')  y_pred = AgglomerativeClustering(n_clusters = 2, linkage = 'ward').fit_predict(spiral[: , 1 : 3])    plt.scatter(spiral[: , 1], spiral[: , 2], c = y_pred)  plt.show() spiralSample()

来源: http://lib.csdn.net/article/python/44491

与本文相关文章

暂无,快来抢沙发吧！