一, 加载数据
- import pandas as pd
- import numpy as np
- url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')
- tips = pd.read_csv(url)
- output = tips.head()
- Output:
- total_bill tip sex smoker day time size
- 0 16.99 1.01 Female No Sun Dinner 2
- 1 10.34 1.66 Male No Sun Dinner 3
- 2 21.01 3.50 Male No Sun Dinner 3
- 3 23.68 3.31 Male No Sun Dinner 2
- 4 24.59 3.61 Female No Sun Dinner 4
二, SELECT 的使用方式
sql 语句: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;.
- output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
- Output:
- total_bill tip smoker time
- 0 16.99 1.01 No Dinner
- 1 10.34 1.66 No Dinner
- 2 21.01 3.50 No Dinner
- 3 23.68 3.31 No Dinner
- 4 24.59 3.61 No Dinner
三, WHERE 的使用方式
1. 举个栗子
sql 语句: SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;
- output = tips[tips['time'] == 'Dinner'].head(5)
- # 或者
- output = tips.query("time =='Dinner'").head(5)
- Output:
- total_bill tip sex smoker day time size
- 0 16.99 1.01 Female No Sun Dinner 2
- 1 10.34 1.66 Male No Sun Dinner 3
- 2 21.01 3.50 Male No Sun Dinner 3
- 3 23.68 3.31 Male No Sun Dinner 2
- 4 24.59 3.61 Female No Sun Dinner 4
2. 比较运算符: 等于 ==, 大于 >, 大于等于 >=, 小于等于 <=, 不等于 !=
2.1 等于 ==
sql 语句: SELECT * FROM tips WHERE time = 'Dinner';.
output = tips[(tips['time'] == 'Dinner')]
2.2 大于 >
sql 语句: SELECT * FROM tips WHERE tip> 5.00;.
output = tips[(tips['tip']> 5.00)]
2.3 大于等于 >=
sql 语句: SELECT * FROM tips WHERE tip>= 5.00;.
output = tips[(tips['size']>= 5)]
2.4 小于等于 <=
sql 语句: SELECT * FROM tips WHERE tip <= 5.00;.
output = tips[(tips['size'] <= 5)]
2.5 不等于 !=
sql 语句: SELECT * FROM tips WHERE tip <> 5.00;.
output = tips[(tips['size'] != 5)]
3. 逻辑运算符: 且 &, 或 |, 非 -
3.1 且 &
sql 语句: SELECT * FROM tips WHERE time = 'Dinner' AND tip> 5.00;
output = tips[(tips['time'] == 'Dinner') & (tips['tip']> 5.00)]
3.2 或 |
sql 语句: SELECT * FROM tips WHERE size>= 5 OR total_bill> 45;.
output = tips[(tips['size']>= 5) | (tips['total_bill']> 45)]
3.3 非 -
sql 语句: SELECT * FROM tips WHERE not (size <> 5 AND size> 4);
output = df[-((df['size'] != 5) & (df['size']> 4))]
4. Null 的判断
这里重新定义一个包含 NaN 数据的 DataFrame.
- frame = pd.DataFrame({
- 'col1': ['A', 'B', np.NaN, 'C', 'D'],
- 'col2': ['F', np.NaN, 'G', 'H', 'I']
- })
- output = frame
- Output:
- col1 col2
- 0 A F
- 1 B NaN
- 2 NaN G
- 3 C H
- 4 D I
4.1 判断列是 Null
sql 语句: SELECT * FROM frame WHERE col2 IS NULL;.
- output = frame[frame['col2'].isna()]
- Output:
- col1 col2
- 1 B NaN
4.2 判断列不是 Null
sql 语句: SELECT * FROM frame WHERE col1 IS NOT NULL;.
- output = frame[frame['col1'].notna()]
- Output:
- col1 col2
- 0 A F
- 1 B NaN
- 3 C H
- 4 D I
5. In,Like 操作
5.1 In
sql 语句: SELECT * FROM tips WHERE siez in (5, 6);.
- output = tips[tips['size'].isin([2, 5])]
- 5.2 Like
sql 语句: SELECT * FROM tips WHERE time like 'Din%';.
output = tips[tips.time.str.contains('Din*')]
四, GROUP BY 的使用方式
sql 语句: SELECT sex, count(*) FROM tips GROUP BY sex;
- output = tips.groupby('sex').size()
- # 获取相应的结果
- output['Male']
- output['Female']
- output = tips.groupby('sex').count()
- # 获取相应的结果
- output['tip']['Female']
- output = tips.groupby('sex')['total_bill'].count()
- # 获取相应的结果
- output['Male']
- output['Female']
sql 语句: SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
- output = tips.groupby('day').agg({
- 'tip': np.mean, 'day': np.size
- })
- # 获取相应的结果
- output['day']['Fri']
- output['tip']['Fri']
sql 语句: SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
- output = tips.groupby(['smoker', 'day']).agg({
- 'tip': [np.size, np.mean]
- })
- # 获取相应的结果
- output['tip']['size']['No']['Fri']
sql 语句: SELECT tip, count(distinct sex) FROM tips GROUP BY tip;
output = tips.groupby('tip').agg({'sex': pd.Series.nunique})
五, JOIN 连接的使用方式
定义两个 DataFrame.
- df1 = pd.DataFrame({
- 'key': ['A', 'B', 'C', 'D'], 'value': np.random.randn(4)
- })
- df2 = pd.DataFrame({
- 'key': ['B', 'D', 'D', 'E'], 'value': np.random.randn(4)
- })
1. 内连接 Inner Join
sql 语句: SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
- output = pd.merge(df1, df2, on='key')
- # 或
- indexed_df2 = df2.set_index('key')
- pd.merge(df1, indexed_df2, left_on='key', right_index=True)
2. 左连接 Left Outer Join
sql 语句: SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;
- output = pd.merge(df1, df2, on='key', how='left')
- # 或
- output = df1.join(df2, on='key', how='left')
3. 右连接 Right Join
sql 语句: SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key', how='right')
4. 全连接 Full Join
sql 语句: SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key;
output = pd.merge(df1, df2, on='key', how='outer')
五, UNION 的使用方式
- df1 = pd.DataFrame({
- 'city': ['Chicago', 'San Francisco', 'New York City'], 'rank': range(1, 4)
- })
- df2 = pd.DataFrame({
- 'city': ['Chicago', 'Boston', 'Los Angeles'], 'rank': [1, 4, 5]
- })
sql 语句: SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
output = pd.concat([df1, df2])
sql 语句: SELECT city, rank FROM df1 UNION SELECT city, rank FROM df2;
output = pd.concat([df1, df2]).drop_duplicates()
六, 与 SQL 等价的其他语法
1. 去重 Distinct
sql 语句: SELECT DISTINCT sex FROM tips;
output = tips.drop_duplicates(subset=['sex'], keep='first', inplace=False)
2. 修改列别名 As
sql 语句: SELECT total_bill AS total, sex AS xes FROM tips;
output = tips.rename(columns={'total_bill': 'total', 'sex': 'xes'}, inplace=False)
3. Limit 与 Offset
sql 语句: SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
output = tips.nlargest(10 + 5, columns='tip').tail(10)
4. 每个 Group 的前几行
sql 语句:
- SELECT * FROM (
- SELECT
- t.*,
- ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
- FROM tips t
- )
- WHERE rn <3
- ORDER BY day, rn;
- output = tips.assign(rn=tips.sort_values(['total_bill'], ascending=False). groupby(['day']).cumcount() + 1). query('rn < 3'). sort_values(['day', 'rn'])
七, Update 的使用方式
sql 语句: UPDATE tips SET tip = tip*2 WHERE tip < 2;
output = tips.loc[tips['tip'] < 2, 'tip'] *= 2
八, Delete 的使用方式
sql 语句: DELETE FROM tips WHERE tip> 9;
output = tips = tips.loc[tips['tip'] <= 9]
九, 参考文章
[Python 实战] Pandas: 让你像写 SQL 一样做数据分析 (一) https://www.cnblogs.com/en-heng/p/5630849.html
Comparison with SQL
来源: http://www.bubuko.com/infodetail-3092726.html