- import pandas as pd
- # 显示所有列 (参数设置为 None 代表显示所有列, 也可以自行设置数字)
- pd.set_option('display.max_columns',None)
- # 显示所有行
- pd.set_option('display.max_rows',None)
- # 设置数据的显示长度, 默认为 50
- pd.set_option('max_colwidth',200)
- # 禁止自动换行 (设置为 False 不自动换行, True 反之)
- pd.set_option('expand_frame_repr', False)
- import numpy as np
- import re
- import sklearn
- import xgboost as xgb
- import seaborn as sns
- import matplotlib.pyplot as plt
- import plotly.offline as py
- py.init_notebook_mode(connected=True)
- import plotly.graph_objs as go
- import plotly.tools as tls
- import warnings
- warnings.filterwarnings('ignore')
- # Going to use these 5 base models for the stacking . 注意 stacking 的过程
- from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
- GradientBoostingClassifier, ExtraTreesClassifier)
- from sklearn.svm import SVC
- from sklearn.model_selection import KFold
# Load the train and test datasets.
train = pd.read_csv('/home/zwt/PycharmProjects/test/Machine_Learning/Titanic/train.csv')
test = pd.read_csv('/home/zwt/PycharmProjects/test/Machine_Learning/Titanic/test.csv')
# Keep the test passenger IDs for easy access later.
PassengerId = test['PassengerId']
train.head(3)  # result is discarded when this runs as a plain script
# Both frames receive the same engineered features, so they are processed together.
full_data = [train, test]

# The train-set median is used to fill missing fares in both frames.
fare_median = train['Fare'].median()
for dataset in full_data:
    # Feature 1: length of the passenger's name.
    dataset['Name_length'] = dataset['Name'].apply(len)
    # Feature 2: whether a cabin is recorded (1) or the value is missing (0).
    # notna() tests for missing values directly instead of comparing type(x) to float.
    dataset['Has_cabin'] = dataset['Cabin'].notna().astype(int)
    # Feature 3: family size, combining SibSp and Parch plus the passenger.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # Feature 4: whether the passenger travels alone, derived from FamilySize.
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    # Fill unknown ports of embarkation with 'S'.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # Fill missing fares.
    dataset['Fare'] = dataset['Fare'].fillna(fare_median)
# Quartile-based fare bands (qcut splits by quantile, cut by equal-width interval).
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Fill missing ages with random integers drawn from [mean - std, mean + std).
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    # randint draws integers, so the float bounds are truncated explicitly.
    age_null_random_list = np.random.randint(
        int(age_avg - age_std), int(age_avg + age_std), size=age_missing.sum()
    )
    # A single .loc assignment writes into the frame itself; the chained form
    # dataset['Age'][mask] = ... may write to a temporary copy instead.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
# Five equal-width age bands.
train['CategoricalAge'] = pd.cut(train['Age'], 5)
# A run of ASCII letters immediately followed by a period. Written as a raw
# string so that "\." is a regex escape rather than an invalid string escape.
_TITLE_PATTERN = re.compile(r'([A-Za-z]+)\.')


def get_title(name):
    """Extract the title (e.g. ``Mr``, ``Miss``) from a passenger name.

    Args:
        name: Full passenger name, such as ``"Braund, Mr. Owen Harris"``.

    Returns:
        The first run of letters that is directly followed by a period,
        without the period, or an empty string when there is no such run.
    """
    title_search = _TITLE_PATTERN.search(name)
    if title_search:
        return title_search.group(1)
    return ""
# Feature: passenger title extracted from the name.
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group uncommon titles under 'Rare' and fold spelling variants into the common ones.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

# Map every feature to a numeric code.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
for dataset in full_data:
    # Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    # Title; anything outside the mapping becomes 0.
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    # Port of embarkation
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    # Fare bands. The statements run in ascending order, so a code written by
    # an earlier line never satisfies a later line's condition.
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # Age bands
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4

# Preprocessing is done; drop the columns that are not used as features.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis=1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis=1)
test = test.drop(drop_elements, axis=1)

# Heatmap of the pairwise Pearson correlation between the remaining columns.
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)
# Observation from the original analysis: apart from FamilySize and Parch, no
# pair of features is strongly correlated, i.e. the features carry little
# redundant information.

# Parameters shared by the out-of-fold prediction code.
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0  # for reproducibility
NFOLDS = 5  # number of folds for out-of-fold prediction
# random_state is not passed here: without shuffle=True it has no effect on
# the splits, and recent scikit-learn releases reject that combination.
kfold = KFold(n_splits=NFOLDS)
# split() returns a one-shot generator. Materialise it so every base model
# iterates over the same folds instead of only the first model seeing any.
kf = list(kfold.split(train))
# Thin wrapper that gives every scikit-learn style classifier the same interface.
class SklearnHelper(object):
    """Build a classifier from a class and a parameter dict and delegate to it.

    Args:
        clf: Classifier class (not an instance); it is called as
            ``clf(**params)`` with ``random_state`` added.
        seed: Value passed to the classifier as ``random_state``.
        params: Keyword arguments for the classifier, or ``None`` for none.
            The dict is copied, so the caller's object is left untouched.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is not mutated; tolerate the None default.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given data; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on ``(x, y)``, print the feature importances and return them.

        Returning the values (previously only printed) lets callers use them
        directly instead of copying the printed numbers by hand.
        """
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        return importances
# Out-of-fold prediction helper used to build the second-level features.
def get_oof(clf, x_train, y_train, x_test, splitter=None):
    """Compute out-of-fold predictions for one base model.

    For every fold the model is trained on the fold's training rows, then used
    to predict the held-out rows of ``x_train`` and all of ``x_test``.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.
        x_train: Training feature array, indexable by integer index arrays.
        y_train: Training target array, indexable the same way.
        x_test: Test feature array.
        splitter: Object whose ``split(x_train)`` yields
            ``(train_index, test_index)`` pairs. Defaults to the module-level
            ``kfold``; a fresh iteration is started on every call, so the
            folds cannot be exhausted by an earlier call.

    Returns:
        A pair ``(oof_train, oof_test)`` of column vectors: the held-out
        predictions for each training row, and the test predictions averaged
        over the folds.

    Raises:
        ValueError: If the splitter yields no folds.
    """
    if splitter is None:
        splitter = kfold
    oof_train = np.zeros((x_train.shape[0],))
    fold_test_predictions = []
    for train_index, test_index in splitter.split(x_train):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        fold_test_predictions.append(clf.predict(x_test))
    if not fold_test_predictions:
        raise ValueError("splitter produced no folds")
    # Average the per-fold test predictions into one value per test row.
    oof_test = np.mean(fold_test_predictions, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# Hyper-parameters for the base classifiers.
# Random Forest
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start must stay off: with it on, refitting with an unchanged
    # n_estimators keeps the trees from the previous fit, so later folds
    # would be predicted by a forest that was trained on their rows.
    'warm_start': False,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0,
}
# Extra Trees
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0,
}
# AdaBoost
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75,
}
# Gradient Boosting
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
}
# Support Vector Classifier
svc_params = {
    'kernel': 'linear',
    'C': 0.025,
}

# Create the 5 helper objects, one per base model.
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# Numpy arrays of the target (Survived), the train features and the test features.
# .values yields the 1-D array directly; Series.ravel() is deprecated.
y_train = train['Survived'].values
train = train.drop(['Survived'], axis=1)
x_train = train.values
x_test = test.values

# Out-of-fold train and test predictions; these become the second-level features.
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)  # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test)  # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)  # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test)  # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test)  # Support Vector Classifier
print("Training is complete")
# Fit each tree-based model on the full training set and print its feature
# importances. The values are then read from the fitted estimator instead of
# being copied by hand from a previous run, so the table below always matches
# the models and the feature columns actually in use.
rf.feature_importances(x_train, y_train)
rf_feature = rf.clf.feature_importances_
et.feature_importances(x_train, y_train)
et_feature = et.clf.feature_importances_
ada.feature_importances(x_train, y_train)
ada_feature = ada.clf.feature_importances_
gb.feature_importances(x_train, y_train)
gb_feature = gb.clf.feature_importances_

# One row per feature, one column per model, for plotting.
cols = train.columns.values
feature_dataframe = pd.DataFrame({
    'features': cols,
    'Random Forest feature importances': rf_feature,
    'Extra Trees feature importances': et_feature,
    'AdaBoost feature importances': ada_feature,
    'Gradient Boost feature importances': gb_feature,
})
# Scatter plots of the per-feature importances, one chart per model.
def _plot_importance_scatter(column, title, filename):
    """Plot one column of ``feature_dataframe`` as a coloured scatter chart.

    Args:
        column: Name of the importance column in ``feature_dataframe``.
        title: Chart title.
        filename: Output name handed to ``py.plot``.
    """
    importances = feature_dataframe[column].values
    features = feature_dataframe['features'].values
    trace = go.Scatter(
        y=importances,
        x=features,
        mode='markers',
        marker=dict(
            sizemode='diameter',
            sizeref=1,
            size=25,
            color=importances,
            colorscale='Portland',
            showscale=True,
        ),
        text=features,
    )
    layout = go.Layout(
        autosize=True,
        title=title,
        hovermode='closest',
        yaxis=dict(
            title='Feature Importance',
            ticklen=5,
            gridwidth=2,
        ),
        showlegend=False,
    )
    # A plain list of traces is used for every chart; go.Data is deprecated.
    fig = go.Figure(data=[trace], layout=layout)
    py.plot(fig, filename=filename)


for _column, _title, _filename in (
    ('Random Forest feature importances', 'Random Forest Feature Importance', 'scatter1'),
    ('Extra Trees feature importances', 'Extra Trees Feature Importance', 'scatter2'),
    ('AdaBoost feature importances', 'AdaBoost Feature Importance', 'scatter3'),
    ('Gradient Boost feature importances', 'Gradient Boosting Feature Importance', 'scatter4'),
):
    _plot_importance_scatter(_column, _title, _filename)
# Add a column holding the row-wise mean of the importance columns.
# numeric_only=True leaves the string column 'features' out of the average;
# without it, recent pandas versions raise TypeError on the mixed-type rows.
feature_dataframe['mean'] = feature_dataframe.mean(axis=1, numeric_only=True)
feature_dataframe.head(5)

# Bar chart of the mean importance per feature.
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
    x=x,
    y=y,
    width=0.5,
    marker=dict(
        color=feature_dataframe['mean'].values,
        colorscale='Portland',
        showscale=True,
        reversescale=False,
    ),
    opacity=0.6,
)]
layout = go.Layout(
    autosize=True,
    title='Barplots of Mean Feature Importance',
    hovermode='closest',
    yaxis=dict(
        title='Feature Importance',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.plot(fig, filename='bar-direct-labels')
# Collect the first-level out-of-fold predictions, one column per base model,
# so they can serve as input features for the next classification stage.
first_level_columns = {
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
}
base_predictions_train = pd.DataFrame(first_level_columns)
base_predictions_train.head(20)

# Heatmap of the correlation between the base models' predictions.
prediction_corr = base_predictions_train.astype(float).corr()
model_names = base_predictions_train.columns.values
heatmap_trace = go.Heatmap(
    z=prediction_corr.values,
    x=model_names,
    y=model_names,
    colorscale='Viridis',
    showscale=True,
    reversescale=True,
)
data = [heatmap_trace]
py.plot(data, filename='labelled-heatmap')
- View Code
- 8oao
来源: http://www.bubuko.com/infodetail-3073188.html