import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Series holding one missing value (NaN).
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# 4x5 DataFrame of random floats with default integer row/column labels.
df = pd.DataFrame(np.random.random((4, 5)))

# Common DataFrame attributes.
# A bare expression is only echoed in an interactive session; print each
# result so that running the file as a script actually shows it.
print(df.dtypes)
print(df.index)
print(df.columns)
print(df.values)

# Common DataFrame methods. Each call returns a new object; df is unchanged.
print(df.describe())
print(df.T)
print(df.sort_index(axis=1, ascending=False))  # columns, descending labels
print(df.sort_values(by=4))  # rows ordered by the column labelled 4

# Selecting data
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Plain [] indexing: a slice selects rows (contiguous only), a list of
# labels selects columns.
print(df[0:3])
print(df[['A', 'D']])

# Select by label: loc.
print(df.loc['20160101', :])
print(df.loc[:, ['A', 'B']])

# Select by position: iloc.
print(df.iloc[[0, 2], [0, 3]])

# Mixed selection (rows by position, columns by label).
# DataFrame.ix was removed in pandas 1.0; translate the row positions to
# labels through the index and use loc instead.
mixed = df.loc[df.index[[0, 2]], ['A', 'D']]
print(mixed)

# Boolean indexing.
print(df[df.B > 5])

# Setting data
df.iloc[2, 2] = 111
df.loc['20160101', 'D'] = 222
# Single loc assignment instead of the chained df.B[mask] = 0, which may
# write to a temporary copy and leave df unchanged.
df.loc[df.A > 5, 'B'] = 0
print(df)

df['F'] = np.nan
df['E'] = range(6)
print(df)

# Handling missing data
# B and C hold integers; cast them to float explicitly before inserting NaN
# rather than relying on silent upcasting during assignment.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
print(df.dropna(axis=0, how='all'))  # how is one of {'any', 'all'}
print(df.fillna(value=0))
print(df.isnull().values.any())  # True if at least one cell is missing

# Combining DataFrames
# Concatenating.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)), columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=['a', 'b', 'c', 'd'])

# axis=0 stacks rows (ignore_index renumbers them 0..n-1);
# axis=1 places the frames side by side.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
res1 = pd.concat([df1, df2, df3], axis=1)
print(res)
print(res1)

# The join parameter: 'outer' keeps the union of columns (missing cells become
# NaN), 'inner' keeps only the columns shared by every frame.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=['a', 'b', 'c', 'd'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)), columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])

res = pd.concat([df1, df2], join='outer', ignore_index=True)
print(res)
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)

# Aligning on a chosen index.
# With axis=1, join='inner' keeps only the row labels present in both frames.
res = pd.concat([df1, df2], axis=1, join='inner')
print(res)
# The join_axes argument was removed in pandas 1.0; reindexing the other
# frame onto df1's index gives the same "keep df1's rows" result.
res = pd.concat([df1, df2.reindex(df1.index)], axis=1)
print(res)

# Appending rows
df1 = pd.DataFrame(np.zeros((3, 4)), columns=['a', 'b', 'c', 'd'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)), columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3, 4)), columns=['b', 'c', 'd', 'e'],
                   index=[2, 3, 4])

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# res renumbers the rows 0..n-1, res1 keeps the original (repeated) labels.
res = pd.concat([df1, df2, df3], ignore_index=True)
res1 = pd.concat([df1, df2, df3])
print(res)
print(res1)

# Merging DataFrames
# Merge on a single key column.
left = pd.DataFrame({
    'key': ['K1', 'K2', 'K3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

right = pd.DataFrame({
    'key': ['K0', 'K1', 'K3'],
    'A': [11, 43, 53],
    'D': [12, -1, 0],
})

# An outer join keeps the keys of both frames and fills the gaps with NaN.
# Column 'A' exists on both sides, so it comes back as A_x (left) and
# A_y (right), the default suffixes.
res = pd.merge(left, right, on='key', how='outer')
print(res)

# Merge on two (or more) key columns: rows match only when every key agrees.
left = pd.DataFrame({
    'key0': ['K1', 'K2', 'K3'],
    'key1': ['X0', 'X2', 'X3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

# NOTE(review): the last key1 value 'K3' may be a typo for 'X3'; it is kept
# as written because changing it would change the join result.
right = pd.DataFrame({
    'key0': ['K0', 'K1', 'K3'],
    'key1': ['X1', 'X0', 'K3'],
    'A': [11, 43, 53],
    'D': [12, -1, 0],
})

res = pd.merge(left, right, on=['key0', 'key1'], how='outer')
print(res)

# Merge on the row index instead of a key column.
left = pd.DataFrame(
    {'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=['K0', 'K1', 'K2'],
)

right = pd.DataFrame(
    {'A': [11, 43, 53], 'D': [12, -1, 0]},
    index=['K1', 'K2', 'K3'],
)

# No `how` is given, so the default inner join applies: only the index labels
# found in both frames are kept.
res = pd.merge(left, right, left_index=True, right_index=True)
print(res)

# Handle overlapping column names with explicit suffixes.
left = pd.DataFrame({
    'key': ['K1', 'K2', 'K3'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
})

right = pd.DataFrame({
    'key': ['K0', 'K1', 'K3'],
    'A': [11, 43, 53],
    'B': [12, -1, 0],
})

# Both frames carry 'A' and 'B'; the suffixes mark which side each came from.
res = pd.merge(left, right, on='key',
               suffixes=('_left', '_right'), how='outer')
print(res)

# Plotting
# Series: running total of 1000 random normal draws, plotted as a line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
print(data)

# DataFrame: four columns of running totals, one line per column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
print(data.head())
data = data.cumsum()
data.plot()

# Two scatter series drawn on the same axes: the first call creates the axes,
# the second reuses them through ax=. The labels must differ, otherwise the
# legend cannot tell the two series apart (both were 'Class 2').
ax = data.plot.scatter(x='A', y='C', color='Red', label='Class 1')
data.plot.scatter(x='A', y='B', color='DarkGreen', label='Class 2', ax=ax)

# Without show() a plain script run exits without displaying any figure.
plt.show()
# Source: (not given)