I. The basic crawler workflow:
# 1. Send a request
Use an HTTP library to send a request to the target site, i.e. send a Request.
A Request contains the request headers, request body, etc.
# 2. Get the response content
If the server responds normally, you get back a Response.
A Response can contain HTML, JSON, images, video, etc.
# 3. Parse the content
Parsing HTML: regular expressions, or third-party parsing libraries such as BeautifulSoup and pyquery.
Parsing JSON: the json module.
Parsing binary data: write it to a file in binary ('wb') mode.
# 4. Save the data
To a database
To a file
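Before the full example below, here is a minimal sketch of those four steps in one place, using httpbin.org as a stand-in target (any JSON endpoint behaves the same way):

import json
import requests

# 1. send the request
res = requests.get('https://httpbin.org/get', params={'wd': 'test'})

# 2. get the response content
print(res.status_code)

# 3. parse the content (JSON here, so the json module / res.json() is enough)
data = json.loads(res.text)   # equivalent to res.json()

# 4. save the data to a file
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)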
II. Let's crawl a campus-beauty site (xiaohuar.com)
import re
import uuid

import requests

# The three steps of a crawler

# Step 1: send the request
def get_page(url):
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    detail_urls = re.findall('.*?href="(.*?)"', index_page, re.S)
    # print(detail_urls)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url

# Parse the detail page
def parse_detail(detail_page):
    video_urls = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if video_urls:
        video_url = video_urls[0]
        if video_url.endswith('.mp4'):
            yield video_url
            # print(video_url)

# Step 3: save the data
def save_video(video_url):
    try:
        res = requests.get(video_url)
        with open(r'D:\pachong\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
    except Exception:
        pass

if __name__ == '__main__':
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        index_page = get_page(index_url)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            video_urls = parse_detail(detail_page)
            for video_url in video_urls:
                save_video(video_url)
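Since BeautifulSoup was listed in section I as an alternative to regular expressions, here is a sketch of what parse_index could look like with it instead of re.findall (assuming beautifulsoup4 is installed):

from bs4 import BeautifulSoup

def parse_index(index_page):
    # collect every href on the index page, same idea as the regex version
    soup = BeautifulSoup(index_page, 'html.parser')
    for a in soup.find_all('a', href=True):
        detail_url = a['href']
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url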
Concurrent version:
# pip3 install requests
import re
import uuid

import requests
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)

# The three steps of a crawler

# Step 1: send the request
def get_page(url):
    print('%s GET start ...' % url)
    index_res = requests.get(url)
    return index_res.text

# Step 2: parse the data
# Parse the index page
def parse_index(index_page):
    # the callback receives a Future; .result() gives the index page text
    res = index_page.result()
    detail_urls = re.findall('.*?href="(.*?)"', res, re.S)
    # print(detail_urls)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
        # yield detail_url

# Parse the detail page
def parse_detail(detail_page):
    res = detail_page.result()
    video_urls = re.findall('id="media".*?src="(.*?)"', res, re.S)
    if video_urls:
        video_url = video_urls[0]
        if video_url.endswith('.mp4'):
            pool.submit(save_video, video_url)
            # print(video_url)

# Step 3: save the data
def save_video(video_url):
    try:
        res = requests.get(video_url)
        with open(r'D:\tank\day01\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
        print('%s done ...' % video_url)
    except Exception:
        pass

if __name__ == '__main__':
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        pool.submit(get_page, index_url).add_done_callback(parse_index)
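Note that parse_index and parse_detail no longer receive page text in this version: add_done_callback hands the callback the finished Future, so each callback must call .result() first. A minimal sketch of that pattern on its own:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(2)

def work(x):
    return x * 2

def on_done(future):
    # the callback gets the Future, not the return value of work(),
    # so .result() is needed to read it (it also re-raises any exception)
    print(future.result())

pool.submit(work, 21).add_done_callback(on_done)
pool.shutdown(wait=True)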
III. Basic usage of requests
1. Two ways to send a GET request:
import requests
from urllib.parse import urlencode

# request URL
base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# request method: GET
res = requests.get(base_url, headers=headers)
# print(res)          # a Response object
# print(res.text)     # the full HTML text
# print(res.content)  # the raw binary content
with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
URL-encoding the query string by hand every time is tedious, so you can simply pass the parameters to GET via params:
import requests

# request URL
base_url = 'https://www.baidu.com/s?'
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# request method: GET, with the query string built from params
res = requests.get(base_url, headers=headers, params={"wd": "黄云"})
with open('小云云.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
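You can verify that requests URL-encoded the params for you by printing the final request URL:

# res.url is the URL actually requested, with params already encoded,
# e.g. https://www.baidu.com/s?wd=%E9%BB%84%E4%BA%91
print(res.url)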
GET request to Zhihu:
import requests

# visit Zhihu
# request URL
zhi_url = 'https://www.zhihu.com/explore'
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# request method: GET
res = requests.get(zhi_url, headers=headers)
with open('知乎.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
GET request to GitHub (using login cookies):
import requests

# request headers for the logged-in settings page
url = 'https://github.com/settings/emails'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia/Shanghai; _gh_sess=U0hueWR2WmcvMEJ3amVCTFpOVm5KUDFob1FQUHBtd1BYK09ENkU0aTBqK1JrYmFiYTd6K3pLb0pSVDV5UzdOU0oxbGluSDR3dmVJYTA3WlVpaHZ2cWJmQTJrVTQzRHVFa1cvT1hrWG1ON1ZMRm1DeEtkQkhDRUVaK2cwUUpRN29UUnlyWnRCODQ3cTRLYWZkcmN5UHdnPT0tLUgxSmxJMUQzWDllblhFT3JMK083Tnc9PQ==--92e621b5b1d19cf03e157bf61e02ded6a1a248c6'
}

# request headers with the cookies captured on the emails page
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; __Host-user_session_same_site=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi; logged_in=yes; dotcom_user=pengsima; _gat=1; tz=Asia/Shanghai; _gh_sess=SE5mdjlBaWtla3B2czNYZFI5UTF6TEhUbERvellXVTZnUVE3d0hjTDBTb3RtZ0UxTXhYSCt4S2h2NXR2c3h2YVNaZUNITHlCOE9GcmhIM2lweVFVellYMExxV3dEK0R1ZU15cUEycmxIRk4yZW1WT2J5c3hFVHZ4Y3ZOaUhBN0ZseWcyTmMwNWxPTEIrMmpnVVpKRUJRPT0tLTdNcFZsOTFidnpxZk05cWVZUmV0MkE9PQ==--6064098de4400f5a7ac71cdd3806abd11b2a0134'
}

# request method: GET
# res = requests.get(url, headers=headers_2)
res = requests.get(url, headers=headers)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
# this string only appears on the page when the cookies log us in
print('1059239165' in res.text)
# True
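Copying the Cookie header out of the browser works, but requests can also carry cookies across requests for you. A minimal sketch with requests.Session (the cookie values below are placeholders, not real ones):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})
# cookies set on (or received by) the session are sent automatically
# with every later request made through the same session object
session.cookies.update({'logged_in': 'yes', 'user_session': 'xxx'})  # placeholder values

res = session.get('https://github.com/settings/emails')
print(res.status_code)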
2. POST requests
import re

import requests

# Step 1: GET https://github.com/login to obtain the authenticity_token
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
login_res = requests.get('https://github.com/login', headers=headers)

authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
print(authenticity_token)

# Step 2: grab the cookies returned by the login page
cookies = {}
# put the login page's cookies into the cookies dict
cookies.update(login_res.cookies.get_dict())
print(cookies)

# Step 3: send a POST request to https://github.com/session
# request method: POST
# request body (form data)
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "pengsima",
    "password": "oa09116611",
    "webauthn-support": "supported"
}
# to send a JSON body instead of a form, use the json= keyword:
# requests.post('https://github.com/session', headers=headers, json=form_data)
res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)
# print(res.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
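The same login flow is often written with requests.Session, which remembers the cookies from the GET and sends them with the POST automatically; a sketch assuming the same form fields as above (the credentials are placeholders):

import re

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

# GET the login page; its cookies are stored on the session automatically
login_res = session.get('https://github.com/login')
token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]

# POST the form; the session sends the saved cookies by itself
res = session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'your_username',      # placeholder
    'password': 'your_password',   # placeholder
    'webauthn-support': 'supported',
})
print(res.status_code)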
3. The Response object
import requests

baidu = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
res = requests.get(baidu, headers=headers)

# status code of the response
print(res.status_code)
print(res)
# response headers
print(res.headers)
# response text
print(res.text)
# final URL of the response
print(res.url)
# cookies set by the server
print(res.cookies)
print(res.cookies.get_dict())
# encoding used to decode res.text
print(res.encoding)
# res.encoding = 'utf-8'
# print(res.encoding)
# redirect history (a list of the intermediate responses)
print(res.history)
# raw binary body
print(res.content)
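Continuing from the snippet above: if res.text comes out garbled, the usual cause is that res.encoding does not match the page's actual charset; assigning requests' own guess fixes the decoding:

res = requests.get(baidu, headers=headers)
# apparent_encoding is requests' guess based on the body content;
# assigning it to res.encoding makes res.text decode with that charset
res.encoding = res.apparent_encoding
print(res.text[:200])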
Download an image:
import requests

bo = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0&src=http://www.xnnews.com.cn/wenyu/lxsj/201611/W020161114828261827516.jpg'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# stream=True avoids loading the whole body into memory at once
res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    for line in res.iter_content():
        # f.write(res.content)
        f.write(line)
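A side note on iter_content(): without an argument it yields one byte at a time (the default chunk_size is 1), so for larger files the download loop above is usually written with an explicit chunk size instead:

with open('bo2.jpg', 'wb') as f:
    # read and write 1 KB at a time instead of byte by byte
    for chunk in res.iter_content(chunk_size=1024):
        f.write(chunk)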
Additional notes:
To disable automatic redirects (they are followed by default), pass:
allow_redirects=False
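For example, against a URL that redirects (httpbin.org is used here as a stand-in), disabling redirects lets you inspect the 30x response itself:

import requests

res = requests.get('https://httpbin.org/redirect/1', allow_redirects=False)
print(res.status_code)               # 302, the redirect response itself
print(res.history)                   # [] because the redirect was not followed
print(res.headers.get('Location'))   # where it would have redirected to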
Source: http://www.bubuko.com/infodetail-3102079.html