爬美图

 
#-*- coding:utf-8 -*-
import os
import urllib2
import cStringIO
import Image
import re
import requests
from lxml import etree
  
# Browser-like HTTP headers attached to every request the script sends.
Mozilla_header = dict([
    ('Connection', 'Keep-Alive'),
    ('Accept', 'text/html, application/xhtml+xml, */*'),
    ('Accept-Language', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'),
])

# Module-level placeholders; both names are reassigned by the crawl loop.
links = []  # post URLs found on the listing page being processed
k = 1

# Ask on the console for the last listing page to crawl (inclusive).
print(u'请输入最后的页数：')
_answer = raw_input()
endPage = int(_answer)  # raises ValueError if the input is not an integer
  
# Crawl listing pages 1..endPage of mzitu.com, open every post whose view
# counter exceeds 500000 and save the images of its sub-pages into ./image.


def _fetch_tree(url):
    """Download *url* with the shared headers and return the parsed lxml tree."""
    req = urllib2.Request(url, headers=Mozilla_header)
    return etree.HTML(urllib2.urlopen(req).read())


def _view_count(text):
    """Return the integer formed by all digits in *text*, or None if there are none."""
    digits = ''.join(re.findall(r'\d+', text))
    return int(digits) if digits else None


def _download_image(img_url, retries=3):
    """Save *img_url* under image/ unless a file with that name already exists.

    Returns True when the target file was already on disk (the caller uses
    this as the signal to stop walking the current post), False otherwise.
    The download is attempted up to *retries* times with a 10 second timeout.
    """
    # Local name: the URL minus its first 7 characters ('http://'), with
    # slashes flattened to underscores.
    pic_name = 'image/' + img_url[7:].replace('/', '_')
    # Check before downloading so an existing image costs no network traffic.
    if os.path.exists(pic_name):
        return True

    req = urllib2.Request(img_url, headers=Mozilla_header)
    for _ in range(retries):
        try:
            data = urllib2.urlopen(req, timeout=10).read()
        except Exception:
            # Best-effort retry on any fetch error; unlike a bare ``except``
            # this lets KeyboardInterrupt / SystemExit propagate.
            continue
        print(u'正在下载%s' % img_url)
        try:
            # ``with`` closes the file even if the write fails.
            with open(pic_name, 'wb') as fp:
                fp.write(data)
        except (IOError, OSError):
            # A local write failure will not be fixed by downloading again.
            break
        return False

    print(u'下载失败%s' % img_url)
    return False


# Create the output directory once, before any page is processed.
if not os.path.exists('image'):
    os.makedirs('image')

for j in range(1, endPage + 1):
    listing = _fetch_tree('http://www.mzitu.com/page/' + str(j))
    # URLs of all posts on this listing page and their view-counter texts.
    links = listing.xpath('//li/a[@target="_blank"]/@href')
    hot = listing.xpath('//li//span[@class="view"]/text()')

    # zip() pairs the lists by position and stops at the shorter one, so a
    # length mismatch no longer raises IndexError.
    # NOTE(review): assumes both XPath results come back in matching order.
    for post_url, views_text in zip(links, hot):
        views = _view_count(views_text)
        if views is None or views <= 500000:
            continue

        page = _fetch_tree(post_url).xpath(
            '//div[@class="pagenavi"]//span[last()-1]/text()')
        try:
            page_count = int(page[0])
        except (IndexError, ValueError):
            # No usable page navigation found for this post: skip it.
            continue

        # NOTE(review): range() stops at page_count - 1; if page_count is the
        # number of the last sub-page, that page is never fetched. Kept as in
        # the original because the site markup cannot be confirmed from here.
        for k in range(1, page_count):
            pics = _fetch_tree(post_url + '/' + str(k)).xpath(
                '//div[@class="main-image"]//@src')
            already_saved = False
            for each in pics:
                if _download_image(each):
                    already_saved = True
            # An image that is already on disk ends the walk through this
            # post once the current sub-page has been handled.
            if already_saved:
                break

print(u'下载完成!')
#该片段来自于http://www.codesnippet.cn/detail/2801201614508.html
来源: http://www.codesnippet.cn/detail/2801201614508.html
与本文相关文章

暂无,快来抢沙发吧！