百度图片的部分结果存在编码问题, 暂时无法爬取, 可以多试几个
- # 思路: 抓取图片地址, 根据地址转存图片 (注意名称); 难点: 转码
- # -*- coding:utf-8 -*-
import json
import os
import re
from urllib import error, parse, request
- # for page in range(4):
- # url = "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=哈士奇&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word=哈士奇&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn="+str(page*30)+"&rn=30&gsm=1e&1520997016315="
- # try:
- # response = request.urlopen(url).read().decode("utf-8")
- # print(type(response))
- #
- # except error.URLError as e:
- # print(e.reason)
- #
class BaiduImg(object):
    """Download Baidu image-search results for one keyword into ``images/``.

    ``request()`` walks the ``acjson`` result pages starting at offset
    ``self.page`` and hands every page's ``data`` list to ``download2()``.
    File names are derived from each entry's ``fromPageTitleEnc`` field via
    the module-level ``strip()`` helper.
    """

    SEARCH_URL = "http://image.baidu.com/search/acjson"
    PAGE_SIZE = 30      # results per page (the ``rn`` query parameter)
    LAST_OFFSET = 30    # request() stops once self.page exceeds this offset
    IMAGE_DIR = "images"
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ApplewebKit/537.36 "
        "(Khtml, like Gecko) Chrome/64.0.3282.140 Safari/537.36"
    )

    def __init__(self, keyword="哈士奇"):
        """Announce the crawl and set the search keyword and start offset.

        Args:
            keyword: Search term sent as both ``queryWord`` and ``word``.
        """
        super(BaiduImg, self).__init__()
        print("开始采集图片")
        self.keyword = keyword
        self.page = 30

    def _build_url(self):
        """Return the search URL for the current ``self.page`` offset.

        The query string is produced by ``urlencode`` so the non-ASCII
        keyword is percent-encoded; urllib cannot send a URL that contains
        raw non-ASCII characters.
        """
        params = [
            ("tn", "resultjson_com"),
            ("ipn", "rj"),
            ("ct", "201326592"),
            ("is", ""),
            ("fp", "result"),
            ("queryWord", self.keyword),
            ("cl", "2"),
            ("lm", "-1"),
            ("ie", "utf-8"),
            ("oe", "utf-8"),
            ("adpicid", ""),
            ("st", "-1"),
            ("z", ""),
            ("ic", "0"),
            ("word", self.keyword),
            ("s", ""),
            ("se", ""),
            ("tab", ""),
            ("width", ""),
            ("height", ""),
            ("face", "0"),
            ("istype", "2"),
            ("qc", ""),
            ("nc", "1"),
            ("fr", ""),
            ("pn", str(self.page)),
            ("rn", str(self.PAGE_SIZE)),
            ("gsm", "1e"),
            ("1520997014923", ""),
        ]
        return self.SEARCH_URL + "?" + parse.urlencode(params)

    def _fetch_page(self):
        """Return the ``data`` list of the current page, or ``[]`` on failure.

        Network errors and undecodable / malformed responses are printed and
        treated as an empty page so that one bad page does not end the crawl.
        """
        req = request.Request(
            self._build_url(), headers={"User-Agent": self.USER_AGENT}
        )
        try:
            with request.urlopen(req) as f:
                if f.status != 200:
                    return []
                raw = f.read()
        except error.URLError as e:
            print(e.reason)
            return []
        try:
            return json.loads(raw.decode("utf-8"))["data"]
        except (ValueError, KeyError) as e:
            # ValueError covers both UnicodeDecodeError and JSONDecodeError.
            print(e)
            return []

    def request(self):
        """Fetch every page from ``self.page`` up to ``LAST_OFFSET``.

        ``self.page`` is advanced after every attempt, successful or not, so
        the loop always terminates.
        """
        while self.page <= self.LAST_OFFSET:
            self.download2(self._fetch_page())
            self.page += self.PAGE_SIZE

    @staticmethod
    def _pick_url(image, keys):
        """Return the first non-empty value among ``keys`` in ``image``, else ""."""
        for key in keys:
            url = image.get(key)
            if url:
                return url
        return ""

    def _target_path(self, image):
        """Return the output path for ``image``, creating the folder if needed."""
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        return self.IMAGE_DIR + "/" + strip(image["fromPageTitleEnc"]) + ".jpg"

    # Download by reading the response body and writing it to a file.
    def dowload(self, data):
        """Save each image in ``data``, preferring middle, thumb, then hover URL.

        Entries without any usable URL are skipped. A failed download or
        write is printed and the loop moves on to the next entry.
        """
        for image in data:
            url = self._pick_url(image, ("middleURL", "thumbURL", "hoverURL"))
            if not url:
                continue
            file_name = self._target_path(image)
            try:
                with request.urlopen(url) as resp:
                    payload = resp.read()
                with open(file_name, "wb") as f:
                    f.write(payload)
            except OSError as e:  # URLError is a subclass of OSError
                print(e)

    # Correctly spelled alias; the original misspelled name is kept for callers.
    download = dowload

    # Download using urllib.request.urlretrieve().
    def download2(self, data):
        """Save each image in ``data`` via ``urlretrieve`` (middle, then thumb URL).

        Entries without any usable URL are skipped. A failed download is
        printed and the loop moves on to the next entry.
        """
        for image in data:
            url = self._pick_url(image, ("middleURL", "thumbURL"))
            if not url:
                continue
            file_path = self._target_path(image)
            try:
                request.urlretrieve(url, file_path)
            except OSError as e:  # URLError is a subclass of OSError
                print(e)
# Filter function
def strip(path):
    """Return ``str(path)`` with characters unsafe in file names removed.

    The characters removed are ``?``, backslash, ``*``, ``|``, ``"``, ``<``,
    ``>``, ``:``, ``/`` and ``!``. Everything else is left untouched.

    Args:
        path: Value to sanitise; it is converted with ``str()`` first.

    Returns:
        The sanitised string.
    """
    return re.sub(r'[?\\*|"<>:/!?]', '', str(path))
if __name__ == "__main__":
    # Script entry point: build the crawler and run a single crawl.
    bi = BaiduImg()
    bi.request()
来源: http://www.bubuko.com/infodetail-2530062.html