# -*- coding: utf-8 -*-
"""Download pictures from popular mzitu.com posts into the ``image`` folder.

The script asks for the last listing page to visit, walks listing pages
``1..endPage``, and for every post whose ``span.view`` counter is above
``MIN_VIEWS`` it walks the post's sub-pages and saves each picture found in
``div.main-image``.  A picture that is already on disk is taken as a sign
that the post was downloaded before, so the post is abandoned after the
current sub-page.

This is Python 2 code (``urllib2``, ``raw_input``).
"""
import os
import urllib2
import cStringIO
import Image
import re
import requests
from lxml import etree

# Browser-like headers sent with every request.
Mozilla_header = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

IMAGE_DIR = 'image'        # pictures are written below the current directory
MIN_VIEWS = 500000         # only posts with a strictly larger counter are fetched
DOWNLOAD_ATTEMPTS = 3      # tries per picture before giving up
DOWNLOAD_TIMEOUT = 10      # seconds, applied to picture requests only


def _fetch_tree(url):
    """Fetch *url* with ``Mozilla_header`` and return it parsed by ``etree.HTML``."""
    request = urllib2.Request(url, headers=Mozilla_header)
    return etree.HTML(urllib2.urlopen(request).read())


def _view_count(text):
    """Return all digit runs of *text* concatenated as an int, or None if none."""
    digits = ''.join(re.findall(r'\d+', text))
    return int(digits) if digits else None


def _save_image(url, pic_name):
    """Download *url* to *pic_name*; return True on success, False otherwise.

    Up to ``DOWNLOAD_ATTEMPTS`` tries are made.  Only ``Exception`` is caught,
    so Ctrl-C (``KeyboardInterrupt``) still stops the script instead of
    merely consuming one retry.
    """
    request = urllib2.Request(url, headers=Mozilla_header)
    print(u'正在下载%s' % url)
    for _ in range(DOWNLOAD_ATTEMPTS):
        try:
            data = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT).read()
            # ``with`` closes the file even when the write itself fails.
            with open(pic_name, 'wb') as fp:
                fp.write(data)
        except Exception:
            continue
        else:
            return True
    print(u'下载失败%s' % url)
    return False


def _download_post(post_url):
    """Save the pictures of every sub-page of the post at *post_url*."""
    tree = _fetch_tree(post_url)
    page = tree.xpath('//div[@class="pagenavi"]//span[last()-1]/text()')
    if not page:
        # No pager found on the page: nothing to iterate over.
        return
    # NOTE(review): range() excludes its upper bound, so if page[0] is the
    # number of the last sub-page, that last sub-page is never fetched.
    # Kept as in the original -- confirm against the site's markup.
    for k in range(1, int(page[0])):
        sub_tree = _fetch_tree(post_url + '/' + str(k))
        already_on_disk = False
        for each in sub_tree.xpath('//div[@class="main-image"]//@src'):
            # each[7:] drops the first seven characters (the length of
            # 'http://'); the naming is kept so files saved by earlier runs
            # are still recognised.
            pic_name = IMAGE_DIR + '/' + each[7:].replace('/', '_')
            if os.path.exists(pic_name):
                # Check before downloading so existing pictures cost no traffic.
                already_on_disk = True
                continue
            _save_image(each, pic_name)
        if already_on_disk:
            # Finish the current sub-page, then stop walking this post.
            break


print(u'请输入最后的页数:')
endPage = int(raw_input())  # last listing page to visit (inclusive)

if not os.path.exists(IMAGE_DIR):  # create the target folder once
    os.makedirs(IMAGE_DIR)

for j in range(1, endPage + 1):
    selector = _fetch_tree('http://www.mzitu.com/page/' + str(j))
    links = selector.xpath('//li/a[@target="_blank"]/@href')  # post URLs on this page
    hot = selector.xpath('//li//span[@class="view"]/text()')  # counter text per post
    # zip() stops at the shorter list, so a page where the two XPath results
    # differ in length no longer raises IndexError.
    # NOTE(review): this assumes links and counters appear in matching order.
    for link, hot_text in zip(links, hot):
        views = _view_count(hot_text)
        if views is not None and views > MIN_VIEWS:
            _download_post(link)

print(u'下载完成!')
# This snippet comes from http://www.codesnippet.cn/detail/2801201614508.html
# Source: http://www.codesnippet.cn/detail/2801201614508.html