requests module
Usage
- import requests
- # 1. Methods
- """
- requests.get
- requests.post
- requests.put
- requests.delete
- ...
- requests.request(method='POST')
- """
- # 2. Parameters
- """
- 2.1 url
- 2.2 headers
- 2.3 cookies
- 2.4 params, URL query-string parameters (see the sketch just below)
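- # A minimal sketch combining 2.1-2.4 (the URL is a placeholder); params is appended to the URL as the query string
- requests.get(
- url='http://127.0.0.1:8000/test/',
- headers={'User-Agent': 'Mozilla/5.0'},
- cookies={'session_id': 'xxx'},
- params={'page': 1, 'size': 20}, # -> http://127.0.0.1:8000/test/?page=1&size=20
- )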
- 2.5 data, pass the request body (form-encoded)
- requests.post(
- ...,
- data={'user':'alex','pwd':'123'}
- )
- # the request body is sent as form-encoded data
- POST /index HTTP/1.1\r\nHost: c1.com\r\n\r\nuser=alex&pwd=123
- 2.6 json, pass the request body as JSON
- requests.post(
- ...,
- json={'user':'alex','pwd':'123'}
- )
- # the request body is the dict serialized as JSON, and Content-Type is set automatically
- POST /index HTTP/1.1\r\nHost: c1.com\r\nContent-Type: application/json\r\n\r\n{"user":"alex","pwd":"123"}
- 2.7 proxies
- # without authentication
- proxie_dict = {
- "http": "61.172.249.96:80", # proxy for all http requests
- "https://www.proxy360.cn/Proxy": "61.172.249.96:80", # proxy for this exact URL only
- "https": "http://61.185.219.126:3128",
- }
- ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict)
- # a proxy that requires authentication
- from requests.auth import HTTPProxyAuth
- proxyDict = {
- 'http': '77.75.105.165', # all http requests go through this proxy
- 'http://www.google.com': '77.75.105.165', # route this single address through a specific proxy
- 'https': '77.75.106.165' # all https requests go through this one
- }
- # create an extra auth object for the proxy credentials
- auth = HTTPProxyAuth('username', 'password')
- r = requests.get("http://www.google.com", data={'xxx': 'ffff'}, proxies=proxyDict, auth=auth)
- print(r.text)
- 2.8 files, file upload
- # send a file
- file_dict = {
- 'f1': open('xxxx.log', 'rb')
- }
- requests.request(
- method='POST',
- url='http://127.0.0.1:8000/test/',
- files=file_dict
- )
- 2.9 auth
- How it works internally:
- the username and password are joined, base64-encoded, and sent to the server in a request header
- - "user:password"
- - base64("user:password")
- - "Basic " + base64("user:password")
- - request header:
- Authorization: "Basic " + base64("user:password")
- from requests.auth import HTTPBasicAuth, HTTPDigestAuth
- ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('admin', 'admin'))
- print(ret.text)
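- # A hand-rolled sketch of the header that HTTPBasicAuth builds (same 'admin'/'admin' credentials as above), for illustration only
- import base64
- basic_token = base64.b64encode(b'admin:admin').decode()
- ret = requests.get('https://api.github.com/user', headers={'Authorization': 'Basic ' + basic_token})
- print(ret.status_code)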
- 2.10 timeout
- # ret = requests.get('http://google.com/', timeout=1) # the connection must complete within 1s
- # print(ret)
- # ret = requests.get('http://google.com/', timeout=(5, 1)) # 5s connect timeout, 1s read timeout
- # print(ret)
- 2.11 allow_redirects, whether to follow redirects until the final response is reached
- ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
- print(ret.text)
- 2.12 stream, large-file download
- from contextlib import closing
- with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
- # process the response here
- for i in r1.iter_content():
- print(i)
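- # A minimal sketch (hypothetical file name) streaming a large response to disk in fixed-size chunks
- with closing(requests.get('http://httpbin.org/get', stream=True)) as r2:
-     with open('big_file.bin', 'wb') as f:
-         for chunk in r2.iter_content(chunk_size=1024):
-             f.write(chunk)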
- 2.13 cert, client-side certificate
- - Baidu, Tencent => no need to supply a certificate (handled for you)
- - custom certificate
- requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
- requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem","xxx.xxx.xx.key"))
- 2.14 verify=False, skip verification of the server's SSL certificate
- requests.get('https://127.0.0.1:8000/test/', verify=False)
- """
Examples
- import requests
- from urllib.parse import urlencode
- # request method
- kwords = input("Enter a keyword: >>").strip()
- res = urlencode({"wd": kwords}) # when you type Chinese into Baidu, the URL you copy out is percent-encoded like this, so urlencode the query ourselves
- url = "https://www.baidu.com/s?" + res # e.g. https://www.baidu.com/s?wd=图片
- response = requests.get(
- # the request URL
- url,
- # request headers
- headers={
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
- },
- )
- with open("a.html","w",encoding="utf-8") as f:
- f.write(response.text)
- # print(response.status_code)
Simple example 1
- kwords = input("Enter a keyword: >>").strip()
- response = requests.get(
- "https://www.baidu.com/s?",
- # the request URL; typing Chinese into Baidu produces a percent-encoded URL, so pass the keyword via params instead
- params={
- "wd":kwords,
- 'pn':20
- },
- # request headers
- headers={
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
- },
- )
- with open("b.html","w",encoding="utf-8") as f:
- f.write(response.text)
- # print(response.status_code)
Simple example 2
bs4 module
Basic usage
- from bs4 import BeautifulSoup
- import requests
- r1 = requests.get(
- ...
- )
- print(r1.text) # the fetched content
- soup = BeautifulSoup(r1.text,'html.parser') # (content, parser)
- # a tag object
- # content_list = soup.find(name='div',id='content-list')
- # find returns the first match, find_all returns all matches
- content_list = soup.find(name='div',attrs={"id":"content-list"})
- # [tag object, tag object, ...]
- item_list = content_list.find_all(name='div',attrs={'class':'item'})
- for item in item_list:
- a = item.find(name='a',attrs={'class':'show-content color-chag'})
- print(a.text.strip())
Common methods
- r1= requests.get( ...)
- soup = BeautifulSoup(r1.text,'html.parser')
- soup.find(name='a',attrs={
- 'class':'show-content color-chag'
- })
- soup.find_all(name='a',attrs={
- 'class':'show-content color-chag'
- })
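- # A minimal sketch of working with the returned tag object (class names taken from the example above)
- tag = soup.find(name='a', attrs={'class': 'show-content color-chag'})
- if tag:
-     print(tag.text) # text inside the tag
-     print(tag.get('href')) # value of a single attribute
-     print(tag.attrs) # all attributes as a dict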
Simple crawler examples
Crawling Chouti, and logging in automatically to upvote
First visit the home page to get a cookie; the login request must carry that cookie to pass validation
- """"""
- # ################################### Example 1: crawl data (with request headers) ###################################
- """
- import requests
- from bs4 import BeautifulSoup
- r1 = requests.get(
- url='https://dig.chouti.com/',
- headers={
- 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
- }
- )
- soup = BeautifulSoup(r1.text,'html.parser')
- content_list = soup.find(name='div',attrs={"id":"content-list"})
- item_list = content_list.find_all(name='div',attrs={'class':'item'})
- for item in item_list:
- a = item.find(name='a',attrs={'class':'show-content color-chag'})
- print(a.text.strip())
- """
- # ################################### Example 2: log in and upvote ###################################
- """
- import requests
- # 1. Visit the home page
- r1 = requests.get(
- url='https://dig.chouti.com/',
- headers={
- 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
- }
- )
- # 2. Submit the username and password
- r2 = requests.post(
- url='https://dig.chouti.com/login',
- headers={
- 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
- },
- data={
- 'phone':'8613121758648',
- 'password':'woshiniba',
- 'oneMonth':1
- },
- cookies=r1.cookies.get_dict()
- # The trick: a real user always visits the home page before logging in.
- # Logging in directly means you must be a crawler, so the site creates and returns a cookie on the first visit to the home page
- # and requires the second request to carry that cookie.
- )
- # 3. Upvote
- r3 = requests.post(
- url='https://dig.chouti.com/link/vote?linksId=20435396',
- headers={
- 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
- },
- cookies=r1.cookies.get_dict()
- )
- print(r3.text)
- """
- # ############## Approach 2: using Session ##############
- """
- # Session stores cookies automatically, so you don't have to carry them yourself afterwards
- import requests
- session = requests.Session()
- i1 = session.get(url="http://dig.chouti.com/help/service")
- i2 = session.post(
- url="http://dig.chouti.com/login",
- data={
- 'phone': "8615131255089",
- 'password': "xxooxxoo",
- 'oneMonth': ""
- }
- )
- i3 = session.post(
- url="http://dig.chouti.com/link/vote?linksId=8589523"
- )
- print(i3.text)
- """
Crawling Lagou
The request headers contain custom verification fields that you must obtain before crawling works, plus the use of Referer
- import re
- import requests
- """
- When the password is encrypted:
- either find the JS and reimplement the encryption in Python,
- or reuse the captured ciphertext directly
- """
- r1 = requests.get(
- url='https://passport.lagou.com/login/login.html',
- headers={
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
- }
- )
- """
- There are two odd values here; they are the site's defense mechanism.
- These two values must have been sent to us by the server,
- either in the response headers or in the response body.
- They are not in the response headers, so look for them in the response body.
- """
- # They are not written inside HTML tags, so a regex is the only way to grab them
- X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token ='(.*?)'", r1.text, re.S)[0]
- X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code ='(.*?)'", r1.text, re.S)[0]
- # print(X_Anti_Forge_Token, X_Anti_Forge_Code)
- r2 = requests.post(
- url='https://passport.lagou.com/login/login.json',
- headers={
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
- 'X-Anit-Forge-Code':X_Anti_Forge_Code,
- 'X-Anit-Forge-Token':X_Anti_Forge_Token,
- 'Referer': 'https://passport.lagou.com/login/login.html', # the URL of the previous request; many sites require this header before letting you continue
- },
- data={
- "isValidate": True,
- 'username': '15131255089',
- 'password': 'ab18d270d7126ea65915c50288c22c0d', # send the ciphertext directly
- 'request_form_verifyCode': '',
- 'submit': ''
- },
- cookies=r1.cookies.get_dict()
- )
- print(r2.text)
Automatically logging in to GitHub
csrf_token validation
- """"""
- # ################################### Example 3: automatic GitHub login ###################################
- # 1. GET, visit the login page
- """
- - find the hidden input tag in the HTML to get the csrf token
- - get the cookie
- """
- # 2. POST, username and password
- """
- - data to send:
- - csrf
- - username
- - password
- - carry the cookie
- """
- # 3. GET, visit https://github.com/settings/emails
- """
- - carry the cookie
- """
- import requests
- from bs4 import BeautifulSoup
- # ##########################################################
- # Visit the login page and grab the authenticity_token
- i1 = requests.get(
- url='https://github.com/login'
- )
- soup1 = BeautifulSoup(i1.text, features='lxml')
- tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
- authenticity_token = tag.get('value') # got the authenticity_token
- c1 = i1.cookies.get_dict()
- i1.close()
- # Send the user verification request carrying the authenticity_token, username, password, etc.
- form_data = {
- "authenticity_token": authenticity_token, # 放在请求体中发过去
- "utf8": "",
- "commit": "Sign in",
- "login": "",
- 'password': ''
- }
- i2 = requests.post(
- url='https://github.com/session',
- data=form_data,
- cookies=c1
- )
- c2 = i2.cookies.get_dict()
- c1.update(c2) # merge the cookies from the two responses
- i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
- soup3 = BeautifulSoup(i3.text, features='lxml')
- list_group = soup3.find(name='div', class_='listgroup')
- from bs4.element import Tag
- for child in list_group.children:
- if isinstance(child, Tag):
- project_tag = child.find(name='a', class_='mr-1')
- size_tag = child.find(name='small')
- temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
- print(temp)
Summary
Request headers:
- user-agent
- referer
- host
- cookie
Special request headers, obtained from the content of a previous request/response:
- 'X-Anit-Forge-Code':...
- 'X-Anit-Forge-Token':...
Request body:
- raw data
- raw data + token
- ciphertext
- find the encryption algorithm
- reuse the captured ciphertext
Common patterns:
- POST to log in and obtain a cookie, then carry that cookie on later requests
- GET to obtain an unauthorized cookie, POST the login carrying that cookie to get it authorized, then carry the cookie afterwards (see the Session sketch below)
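A minimal sketch of the second pattern with requests.Session (URLs and field names are placeholders):
- import requests
- session = requests.Session()
- r1 = session.get('http://example.com/') # first visit: the server sets an unauthorized cookie
- r2 = session.post('http://example.com/login', data={'user': 'x', 'pwd': 'y'}) # the login carries that cookie and gets it authorized
- r3 = session.get('http://example.com/protected') # later requests carry the cookie automatically
- print(r3.status_code)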
Source: http://www.bubuko.com/infodetail-2948111.html