1 Python 环境安装
shift + Enter : 换行
ctrl + Enter : 执行
2 Python IDE 环境安装
3 数据预处理
头几行展示
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- %matplotlib inline
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.cross_validation import KFold
# Read the shot log CSV into a DataFrame.
filename = "C:\\ML\\MLData\\data.csv"
raw = pd.read_csv(filename)

# Show the (rows, columns) shape, then preview the first rows.
print(raw.shape)
raw.head()
尾几行展示
去除空值(后文使用的 kobe 应为去除 shot_made_flag 空值后的数据,例如 kobe = raw[pd.notnull(raw['shot_made_flag'])],请以原文为准)
matplot 列属性绘制分布
# plt.subplot(211): the first digit is the number of rows, the second the
# number of columns, the third the index of the current panel.
# NOTE(review): `kobe` is not defined in the visible code; presumably it is
# `raw` restricted to rows with a known shot_made_flag -- confirm.

# Marker transparency (controls how dark dense regions appear).
alpha = 0.02

# Overall size of the figure.
plt.figure(figsize=(10, 10))

# Panel 1 (one row, two columns, first position): loc_x against loc_y.
plt.subplot(121)
# Lowercase colour codes: uppercase single-letter codes ('R', 'B') were
# deprecated in Matplotlib 3.1 and are not accepted by later releases.
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# Panel 2 (one row, two columns, second position): lon against lat.
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')
角度和极坐标预处理
# Polar-style features derived from the (loc_x, loc_y) coordinates:
# distance from the origin and the angle of the position.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Rows with loc_x == 0 would divide by zero below, so they are handled
# separately and receive pi/2 directly.
loc_x_zero = raw['loc_x'] == 0

# Start from a float column and assign through .loc. Chained indexing such as
# raw['angle'][mask] = ... may write to a temporary copy instead of `raw`,
# and an integer-initialised column cannot hold the fractional angles.
raw['angle'] = 0.0
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)
raw.loc[loc_x_zero, 'angle'] = np.pi / 2
时间处理
# Total remaining time in seconds: minutes converted to seconds plus the
# leftover seconds.
seconds_from_minutes = raw['minutes_remaining'] * 60
raw['remaining_time'] = seconds_from_minutes + raw['seconds_remaining']
属性唯一值及分组统计打印出来
投篮方式
# Print the distinct values of each shot-description column.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kobe[column].unique())
分组统计
# Number of rows for each distinct shot_type value.
shot_type_counts = kobe['shot_type'].value_counts()
print(shot_type_counts)
按列进行特殊符号处理
# Distinct season labels before conversion.
kobe['season'].unique()
# Output:
# array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
# '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
# '2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
# '1998-99', '1999-00'], dtype=object)


def _season_suffix(label):
    """Return the second '-'-separated field of a season label as an int."""
    return int(label.split('-')[1])


# Replace each label such as '2000-01' by the integer after the dash (here 1).
raw['season'] = raw['season'].apply(_season_suffix)
raw['season'].unique()
# Output:
# array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97,
# 98, 99, 0], dtype=int64)
pd 的 DataFrame 使用技巧(matchup 两队对决, opponent 对手是谁)
# Side-by-side view of the matchup and opponent columns.
pd.DataFrame({'matchup': kobe['matchup'], 'opponent': kobe['opponent']})

# Plot the computed dist column against the provided shot_distance column.
plt.figure(figsize=(5, 5))
plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')

# Row counts per shot_zone_area value and the number of distinct groups.
gs = kobe.groupby('shot_zone_area')
print(kobe['shot_zone_area'].value_counts())
print(len(gs))
# Output:
# Center(C) 11289
# Right Side Center(RC) 3981
# Right Side(R) 3859
# Left Side Center(LC) 3364
# Left Side(L) 3132
# Back Court(BC) 72
# Name: shot_zone_area, dtype: int64
# 6
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter loc_x/loc_y of `kobe`, one rainbow colour per `feat` group.

    Draws onto the current Matplotlib axes.

    Args:
        feat: Key passed to `kobe.groupby` to split the rows into groups.
    """
    alpha = 0.1
    groups = kobe.groupby(feat)
    # One evenly spaced rainbow colour per group.
    palette = cm.rainbow(np.linspace(0, 1, len(groups)))
    for (_, rows), colour in zip(groups, palette):
        plt.scatter(rows.loc_x, rows.loc_y, color=colour, alpha=alpha)


# One panel per zone feature, laid out in one row of three; each panel is
# titled with the feature name.
for panel, zone_feature in zip(
    (131, 132, 133),
    ('shot_zone_area', 'shot_zone_basic', 'shot_zone_range'),
):
    plt.subplot(panel)
    scatter_plot_by_category(zone_feature)
    plt.title(zone_feature)
# Columns removed before modelling.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument of DataFrame.drop.
raw = raw.drop(drops, axis=1)

print(raw['combined_shot_type'].value_counts())
# Preview the one-hot encoding of a single column (first two rows).
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]
# Output:
# Jump Shot 23485
# Layup 5448
# Dunk 1286
# Tip Shot 184
# Hook Shot 153
# Bank Shot 141
# Name: combined_shot_type, dtype: int64

# Replace every categorical column by its one-hot (dummy) columns, prefixed
# with the original column name.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    dummies = pd.get_dummies(raw[var], prefix=var)
    # Keyword `axis` here as well: pandas >= 2.0 rejects it positionally.
    raw = pd.concat([raw, dummies], axis=1)
    raw = raw.drop(var, axis=1)
# Rows with a known shot_made_flag form the training set; rows where it is
# missing form the set to predict.
has_label = pd.notnull(raw['shot_made_flag'])

train_kobe = raw[has_label]
# Read the label BEFORE dropping its column; dropping first (as the original
# order did) makes the lookup raise KeyError.
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

test_kobe = raw[~has_label]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time

# Find the n_estimators value with the lowest cross-validated log loss.
# NOTE(review): KFold(n, n_folds=...) is the sklearn.cross_validation API,
# which was removed in scikit-learn 0.20; on newer releases use
# sklearn.model_selection.KFold(n_splits=10, shuffle=True).split(train_kobe).
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
n_folds = 10
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k, test_k in KFold(len(train_kobe), n_folds=n_folds, shuffle=True):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss expects class probabilities, not hard 0/1 predictions.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        # Accumulate the mean log loss over the folds.
        rfc_score += log_loss(train_label.iloc[test_k], pred) / n_folds
    scores_n.append(rfc_score)
    # Lower log loss is better; remember the best candidate so far.
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)
# Find the max_depth value with the lowest cross-validated log loss, using
# the best_n number of trees selected earlier.
# NOTE(review): KFold(n, n_folds=...) is the sklearn.cross_validation API,
# which was removed in scikit-learn 0.20; on newer releases use
# sklearn.model_selection.KFold(n_splits=10, shuffle=True).split(train_kobe).
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
n_folds = 10
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(len(train_kobe), n_folds=n_folds, shuffle=True):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss expects class probabilities, not hard 0/1 predictions.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        # Accumulate the mean log loss over the folds.
        rfc_score += log_loss(train_label.iloc[test_k], pred) / n_folds
    scores_m.append(rfc_score)
    # Lower log loss is better; remember the best candidate so far.
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    # The value reported here is a max depth, not a number of trees.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)
# Scores of both searches side by side: number of trees on the left,
# max depth on the right.
plt.figure(figsize=(10, 5))
search_panels = [
    (range_n, scores_n, 'number of trees'),
    (range_m, scores_m, 'max depth'),
]
for panel, (candidates, scores, x_label) in enumerate(search_panels, start=1):
    plt.subplot(1, 2, panel)
    plt.plot(candidates, scores)
    plt.ylabel('score')
    plt.xlabel(x_label)
# Fit the final classifier on all labelled rows with the selected settings.
final_params = {'n_estimators': best_n, 'max_depth': best_m}
model = RandomForestClassifier(**final_params)
model.fit(train_kobe, train_label)
- # 474241623
来源: https://juejin.im/post/5c0b56a8e51d45145b0dbdf5