PCA 主成分分析, 无监督学习降维方法:
- import matplotlib.pyplot as plt
- import matplotlib.image as mpimg
- import matplotlib
- import numpy as np
- import seaborn as sns
- import pandas as pds
- import plotly.graph_objs as go
- import plotly.tools as tls
- %matplotlib inline
- from sklearn.manifold import TSNE
- from sklearn.decomposition import PCA
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
使用 sklearn 方式展现 PCA
- train=pds.read_csv("train.csv")
- train.head()
label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | pixel10 | pixel11 | pixel12 | pixel13 | pixel14 | pixel15 | pixel16 | pixel17 | pixel18 | pixel19 | pixel20 | pixel21 | pixel22 | pixel23 | pixel24 | pixel25 | pixel26 | pixel27 | pixel28 | pixel29 | pixel30 | pixel31 | pixel32 | pixel33 | pixel34 | pixel35 | pixel36 | pixel37 | pixel38 | pixel39 | pixel40 | pixel41 | pixel42 | pixel43 | pixel44 | pixel45 | pixel46 | pixel47 | pixel48 | pixel49 | pixel50 | pixel51 | pixel52 | pixel53 | pixel54 | pixel55 | pixel56 | pixel57 | pixel58 | ... | pixel724 | pixel725 | pixel726 | pixel727 | pixel728 | pixel729 | pixel730 | pixel731 | pixel732 | pixel733 | pixel734 | pixel735 | pixel736 | pixel737 | pixel738 | pixel739 | pixel740 | pixel741 | pixel742 | pixel743 | pixel744 | pixel745 | pixel746 | pixel747 | pixel748 | pixel749 | pixel750 | pixel751 | pixel752 | pixel753 | pixel754 | pixel755 | pixel756 | pixel757 | pixel758 | pixel759 | pixel760 | pixel761 | pixel762 | pixel763 | pixel764 | pixel765 | pixel766 | pixel767 | pixel768 | pixel769 | pixel770 | pixel771 | pixel772 | pixel773 | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- print(train.shape)
- (42000, 785)
- target=train["label"]
- train=train.drop("label",axis=1)
- train.head()
pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | pixel10 | pixel11 | pixel12 | pixel13 | pixel14 | pixel15 | pixel16 | pixel17 | pixel18 | pixel19 | pixel20 | pixel21 | pixel22 | pixel23 | pixel24 | pixel25 | pixel26 | pixel27 | pixel28 | pixel29 | pixel30 | pixel31 | pixel32 | pixel33 | pixel34 | pixel35 | pixel36 | pixel37 | pixel38 | pixel39 | pixel40 | pixel41 | pixel42 | pixel43 | pixel44 | pixel45 | pixel46 | pixel47 | pixel48 | pixel49 | pixel50 | pixel51 | pixel52 | pixel53 | pixel54 | pixel55 | pixel56 | pixel57 | pixel58 | pixel59 | ... | pixel724 | pixel725 | pixel726 | pixel727 | pixel728 | pixel729 | pixel730 | pixel731 | pixel732 | pixel733 | pixel734 | pixel735 | pixel736 | pixel737 | pixel738 | pixel739 | pixel740 | pixel741 | pixel742 | pixel743 | pixel744 | pixel745 | pixel746 | pixel747 | pixel748 | pixel749 | pixel750 | pixel751 | pixel752 | pixel753 | pixel754 | pixel755 | pixel756 | pixel757 | pixel758 | pixel759 | pixel760 | pixel761 | pixel762 | pixel763 | pixel764 | pixel765 | pixel766 | pixel767 | pixel768 | pixel769 | pixel770 | pixel771 | pixel772 | pixel773 | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- # 数据标准化
- from sklearn.preprocessing import StandardScaler
- X=train.values
- transform=StandardScaler()
- X_std=transform.fit_transform(X)
- c:\users\lenovo\appdata\local\programs\python\python36\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning:
- Data with input dtype int64 was converted to float64 by StandardScaler.
- c:\users\lenovo\appdata\local\programs\python\python36\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning:
- Data with input dtype int64 was converted to float64 by StandardScaler.
- # 特征向量和特征值
- mean_vec=np.mean(X_std,axis=0)
- cov_mat=np.cov(X_std.T)
- eig_vals,eig_vecs=np.linalg.eig(cov_mat)
- # 创建特征向量和特征值的元组
- eig_pairs=[(np.abs(eig_vals[i]),eig_vecs[:,i]) for i in range(len(eig_vals))]
- # 对特征向量和特征值进行排序
- eig_pairs.sort(key=lambda x : x[0],reverse=True)
- # 计算累计解释方差
- tot=sum(eig_vals)
- var_exp=[(i/tot)*100 for i in sorted(eig_vals,reverse=True)]
- cum_var_exp=np.cumsum(var_exp)
- # 接下来使用 Plotly 可视化显示
- trace1=go.Scatter(x=list(range(784)),y=cum_var_exp,mode="lines+markers",name="累计解释方差",line=dict(shape='spline',color='goldenrod'))
- trace2=go.Scatter(x=list(range(784)),y=var_exp,mode="lines+markers",name="单个解释方差",line=dict(shape='linear',color='black'))
- fig=tls.make_subplots(insets=[{'cell':(1,1),'l':0.7,'b':0.5}],print_grid=True)
- fig.append_trace(trace1,1,1)
- fig.append_trace(trace2,1,1)
- fig.layout.title="解释方差"
- fig.layout.xaxis=dict(range=[0,80],title="特征列")
- fig.layout.yaxis=dict(range=[0,60],title="解释变量")
- This is the format of your plot grid:
- [ (1,1) x1,y1 ]
- With insets:
- [ x2,y2 ] over [ (1,1) x1,y1 ]
- # 可视化特征值
- n_components=30
- pca=PCA(n_components=n_components).fit(train.values)
- eigenvalues=pca.components_.reshape(n_components,28,28)
- eigenvalues=pca.components_
- n_row=4
- n_col=7
- # 显示前 28 个（n_row*n_col）个主成分对应的特征向量图像
- plt.figure(figsize=(12,13))
- for i in list(range(n_row*n_col)):
- offset=0
- plt.subplot(n_row,n_col,i+1)
- plt.imshow(eigenvalues[i].reshape(28,28),cmap='jet')
- title_text="Eigenvalue"+str(i+1)
- plt.title(title_text,size=6.5)
- plt.xticks(())
- plt.yticks(())
- plt.show()
- plt.figure(figsize=(14,12))
- for digit_num in range(0,70):
- plt.subplot(7,10,digit_num+1)
- grid_data=train.iloc[digit_num].values.reshape(28,28)
- plt.imshow(grid_data,interpolation="none",cmap="afmhot")
- plt.xticks([])
- plt.yticks([])
- plt.tight_layout()
- #PCA 使用在 SK-learn 中
- del X
- X=train[:6000].values
- del train
- X_std=StandardScaler().fit_transform(X)
- pca=PCA(n_components=5)
- pca.fit(X_std)
- X_5d=pca.transform(X_std)
- # 使用散点图显示 PCA 效果
- import plotly.offline as py
- py.init_notebook_mode(connected=True)
- Target=target[:6000]
- trace0=go.Scatter(x=X_5d[:,0],y=X_5d[:,1],mode="markers",text=Target,showlegend=False,marker=dict(size=8,color=Target,colorscale="Jet",showscale=False,line=dict(width=2,color="rgb(255,255,255)"),opacity=0.8))
- data=[trace0]
- layout=go.Layout(title="PCA",hovermode="closest",xaxis=dict(title="First Principal Component",ticklen=5,zeroline=False,gridwidth=2,),yaxis=dict(title="Second Principal Component",ticklen=5,gridwidth=2,),showlegend=True)
- fig=dict(data=data,layout=layout)
- py.iplot(fig,filename="style-scatter")
- from sklearn.cluster import KMeans
- kmeans=KMeans(n_clusters=9)
- X_clustered=kmeans.fit_predict(X_5d)
- trace_Kmeans=go.Scatter(x=X_5d[:,0],y=X_5d[:,1],mode="markers",showlegend=False,marker=dict(size=8,color=X_clustered,colorscale="Portland",showscale=False,line=dict(width=2,color='rgb(255,255,255)')))
- layout=go.Layout(title="K-Means",hovermode="closest",xaxis=dict(title="First Principal Component",ticklen=5,zeroline=False,gridwidth=2,),yaxis=dict(title="Second Principal Component",ticklen=5,gridwidth=2,),showlegend=True)
- data=[trace_Kmeans]
- fig1=dict(data=data,layout=layout)
- py.iplot(fig1,filename="svm")
PCA 降维
来源: http://www.bubuko.com/infodetail-2967775.html