- # Load the red-card dataset from the local gzip-compressed CSV:
- df = pd.read_csv("redcard.csv.gz", compression='gzip')
- df.shape
- (146028, 28)
- df.head()
- df.dtypes
- playerShort object
- player object
- club object
- leagueCountry object
- birthday object
- height float64
- weight float64
- position object
- games int64
- victories int64
- ties int64
- defeats int64
- goals int64
- yellowCards int64
- yellowReds int64
- redCards int64
- photoID object
- rater1 float64
- rater2 float64
- refNum int64
- refCountry int64
- Alpha_3 object
- meanIAT float64
- nIAT float64
- seIAT float64
- meanExp float64
- nExp float64
- seExp float64
- dtype: object
- all_columns = df.columns.tolist()
- all_columns
- ['playerShort',
- 'player',
- 'club',
- 'leagueCountry',
- 'birthday',
- 'height',
- 'weight',
- 'position',
- 'games',
- 'victories',
- 'ties',
- 'defeats',
- 'goals',
- 'yellowCards',
- 'yellowReds',
- 'redCards',
- 'photoID',
- 'rater1',
- 'rater2',
- 'refNum',
- 'refCountry',
- 'Alpha_3',
- 'meanIAT',
- 'nIAT',
- 'seIAT',
- 'meanExp',
- 'nExp',
- 'seExp']
- df['height'].mean()
- 181.93593798236887
- df['height'].mean()
- 181.93593798236887
- np.mean(df.groupby('playerShort').height.mean())
- 181.74372848007872
- Tidy Data
# Toy frame for demonstrating groupby: two key columns and two columns of
# random draws (values differ on every run because no seed is set).
df2 = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
# Group data1 by the key1 column of the same frame. The key must come from
# df2: the red-card frame `df` has no 'key1' column.
grouped = df2['data1'].groupby(df2['key1'])
grouped.mean()
- key1
- a -0.093686
- b -0.322711
- Name: data1, dtype: float64
# Column that identifies a player; used as the groupby key below.
player_index = 'playerShort'
# Columns expected to hold a single value per player.
player_cols = [
    # 'player',  # drop player name, we have unique identifier
    'birthday',
    'height',
    'weight',
    'position',
    'photoID',
    'rater1',
    'rater2',
]
# Count the distinct values of every player-level column for each player.
all_cols_unique_players = df.groupby(player_index).agg(
    {col: 'nunique' for col in player_cols}
)
all_cols_unique_players.head()
# True when no player has more than one distinct value in ANY of the columns.
# (Masking and then calling dropna() would only flag players that are
# non-unique in every column at once, because dropna() removes a row as soon
# as one of its cells is NaN.)
not (all_cols_unique_players > 1).any().any()
- True
def get_subgroup(dataframe, g_index, g_columns):
    """Build a one-row-per-group sub-table from ``g_columns``.

    The frame is grouped by ``g_index``. A warning is printed if any group
    holds more than one distinct value in any of ``g_columns``, since the
    caller presumably expects these columns to be constant within a group.
    The sub-table is returned either way.

    Args:
        dataframe: The pandas DataFrame to summarise.
        g_index: Column name (or list of names) to group by.
        g_columns: Iterable of column names to carry into the sub-table.

    Returns:
        A DataFrame indexed by ``g_index`` holding the per-group ``max`` of
        each column in ``g_columns``.
    """
    by_group = dataframe.groupby(g_index)
    distinct_counts = by_group.agg({col: 'nunique' for col in g_columns})
    # Flag a violation in ANY column. Masking followed by dropna() would only
    # catch groups that are non-unique in every column simultaneously.
    if (distinct_counts > 1).any().any():
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    # 'max' collapses each group to a single representative value per column.
    return by_group.agg({col: 'max' for col in g_columns})
- players = get_subgroup(df, player_index, player_cols)
- players.head()
来源: https://juejin.im/post/5c0e9b306fb9a049c643aecc