运用 python3 中的 urllib 爬取贴吧的图片:
import os
import re
import urllib
import urllib.request
from urllib import parse

import lxml
import lxml.etree
- # 抓取贴吧页面数量信息
- def gettiebalistnumbers(name): #计算搜索的关键词有多少页 输入名字 返回页数
- url="https://tieba.baidu.com/f?"
- headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
- Word = {"kw": name} # 接口 贴吧的名字
- Word = parse.urlencode(Word) # 编码成字符串
- url = url + Word # 拼接 url
- request = urllib.request.Request(url, headers=headers) # 发送请求
- # 也可以通过调用 Request.add_header() 添加 / 修改一个特定的 header
- request.add_header("Connection", "keep-alive") # 一直活着
- response = urllib.request.urlopen(request) # 打开请求
- data = response.read().decode("utf-8") # 读取数据
- print(response.code) # 可以查看相应状态码
- restr = "<span class=\"card_infoNum\">([\s\S]*?)</span>" # 正则这个贴吧有多少帖子
- regex = re.compile(restr, re.IGNORECASE)
- mylist = regex.findall(data) #寻找页面所有符合条件的
- tienumbers = mylist[0].replace(",","") #替换逗号
- tienumbers = eval(tienumbers) #str 转化为数字
- restr = "<span class=\"card_menNum\">([\s\S]*?)</span>" # 正则关注贴吧的数
- regex = re.compile(restr, re.IGNORECASE)
- mylist = regex.findall(data) # 寻找页面所有符合条件的
- Peoplenumbers = mylist[0].replace(",", "") # 替换逗号
- Peoplenumbers = eval(Peoplenumbers) # str 转化为数字
- return tienumbers,Peoplenumbers
- def gettiebalist(name): #抓取所有的符合 name 的页数 输入搜索关键词, 返回所有的页数 url
- numberstuple=gettiebalistnumbers(name) #(元组)
- tienumbers=numberstuple[1] #帖子的数量
- Word = {"kw": name} # 接口 贴吧的名字
- Word = parse.urlencode(Word) # 编码成字符串
- tiebalist = []
- if tienumbers%53==0: #生成页面列表
- for i in range(tienumbers//53):
- tiebalist.append("https://tieba.baidu.com/f?"+Word+"&pn="+str(i*50))
- else:
- for i in range(tienumbers//53+1):
- tiebalist.append("https://tieba.baidu.com/f?"+Word+"&pn="+str(i*50))
- #print(tiebalist)
- return tiebalist
- def geturllistformpage(url): # 抓取页面的每个帖子 url 输入一页 url 返回列表内的的所有 url
- headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
- request = urllib.request.Request(url, headers=headers) # 发起请求,
- # 也可以通过调? Request.add_header() 添加 / 修改? 个特定的 header
- response = urllib.request.urlopen(request)
- data = response.read().decode("utf-8", "ignore") # 打开请求, 抓取数据
- # print(response.code) # 可以查看响应状态码
- restr = "<ul id=\"thread_list\"class=\"threadlist_bright j_threadlist_bright\">([\s\S]*?)<div class=\"thread_list_bottom clearfix\">" # 正则表达式,() 只要括号内的数据
- regex = re.compile(restr, re.IGNORECASE)
- mylist = regex.findall(data)
- # print(mylist[0])# 抓取整个表格
- restr = "href=\"/p/(\d+)\"" # 正则表达式,() 只要括号内的数据
- regex = re.compile(restr, re.IGNORECASE)
- urltitlelist = regex.findall(data) #抓取的 url 变化的数字
- urllist = []
- for title in urltitlelist:
- urllist.append("http://tieba.baidu.com/p/" + title) # 拼接链接 得到每个页面的帖子 url 列表
- return urllist
- def urllistfrompage(url): #一个帖子页面的所有页数, 输入一个帖子 url 返回一个帖子所有页数的 url 列表
- headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
- request = urllib.request.Request(url, headers=headers)
- response = urllib.request.urlopen(request)
- data = response.read()#.decode("utf-8","ignore")
- mytree=lxml.etree.HTML(data)
- numbers=eval(mytree.xpath("//*[@class =\"l_reply_num\"]//span[last()]/text()")[0])
- urllist=[]
- for i in range(1,numbers+1):
- urllist.append(url+"?pn="+str(i))
- return urllist
- def getjpglistfrompage(url): #输入一个分页的 url 提取所有的图片 url 并保存到本地
- headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"} # header 字典形式
- request = urllib.request.Request(url, headers=headers)
- response = urllib.request.urlopen(request)
- data = response.read()
- mytree=lxml.etree.HTML(data)
- jpgurllist=mytree.xpath("//*[@class=\"BDE_Image\"]/@src")
- return jpgurllist
- name="关晓彤"
- jpgnumbers=0
- for souurl in gettiebalist(name):
- sousurl=geturllistformpage(souurl)
- for fenurl in sousurl:
- jpgallurl=urllistfrompage(fenurl)
- for rev in jpgallurl:
- for jpgurl in getjpglistfrompage(rev):
- jpgnumbers += 1
- urllib.request.urlretrieve(jpgurl, "jpg/" + str(jpgnumbers) + ".jpg")
来源: http://www.bubuko.com/infodetail-3456458.html