- '''
- @author: Garvin
- '''from numpy import * import matplotlib.pyplot as plt def loadDataSet(fileName, delim = '\t') : fr = open(fileName) stringArr = [line.strip().split(delim) for line in fr.readlines()] datArr = [map(float, line) for line in stringArr]
- return mat(datArr) def pca(dataMat, topNfeat = 9999999) : meanVals = mean(dataMat, axis = 0) meanRemoved = dataMat - meanVals#remove mean covMat = cov(meanRemoved, rowvar = 0) eigVals,
- eigVects = linalg.eig(mat(covMat)) eigValInd = argsort(eigVals)#sort,
- sort goes smallest to largest eigValInd = eigValInd[: -(topNfeat + 1) : -1]#cut off unwanted dimensions redEigVects = eigVects[: , eigValInd]#reorganize eig vects largest to smallest lowDDataMat = meanRemoved * redEigVects#transform data into new dimensions reconMat = (lowDDataMat * redEigVects.T) + meanVals
- return lowDDataMat,
- reconMat def plotBestFit(dataSet1, dataSet2) : dataArr1 = array(dataSet1) dataArr2 = array(dataSet2) n = shape(dataArr1)[0] n1 = shape(dataArr2)[0] xcord1 = [];
- ycord1 = [] xcord2 = [];
- ycord2 = [] xcord3 = [];
- ycord3 = [] j = 0
- for i in range(n) :
- xcord1.append(dataArr1[i, 0]);
- ycord1.append(dataArr1[i, 1]) xcord2.append(dataArr2[i, 0]);
- ycord2.append(dataArr2[i, 1]) fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(xcord1, ycord1, s = 30, c = 'red', marker = 's') ax.scatter(xcord2, ycord2, s = 30, c = 'green')
- plt.xlabel('X1');
- plt.ylabel('X2');
- plt.show() if __name__ == '__main__': mata = loadDataSet('/Users/hakuri/Desktop/testSet.txt') a,
- b = pca(mata, 2)
来源: http://lib.csdn.net/snippet/machinelearning/44495