#!/usr/bin/env python
# coding=utf-8
import os
import re
import urllib
'''Scrape images from www.meizitu.com.'''
# Example listing page: http://www.meizitu.com/a/list_1_1.html
def getHtml(url):
    """Fetch ``url`` and return the response body.

    Args:
        url: Address of the page to download.

    Returns:
        The content returned by ``read()`` on the opened response,
        without any decoding applied.
    """
    # urllib.urlopen is the Python 2 API; its response object cannot be used
    # in a ``with`` statement there, so it is closed explicitly to avoid
    # leaking the connection when read() raises.
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def getImg(html):
    """Extract the ``.jpg`` image addresses referenced in ``html``.

    Args:
        html: Page markup as text.

    Returns:
        A list with the value of every ``data-original="..."`` attribute
        whose value ends in ``.jpg``, in order of appearance. The list is
        empty when nothing matches.
    """
    # ``\.`` matches a literal dot (the earlier ``\\.`` demanded a backslash
    # followed by any character, so ordinary URLs never matched).
    # ``[^"]`` keeps a match inside a single attribute value instead of
    # letting it run across a closing quote into a later attribute.
    pattern = r'data-original="([^"]+?\.jpg)"'
    return re.findall(pattern, html)
def download(imgList, page, save_dir='D:/python/meizitu'):
    """Download every image in ``imgList`` into ``save_dir``.

    Files are named ``<index>.jpg``. The index starts at ``(page - 1) * 10``
    and grows by one per image, so page 1 starts at 0, page 2 at 10, and so
    on. If a page yields more than 10 images, its indices overlap those of
    the following page and the later download overwrites the earlier file.

    Args:
        imgList: List of image URLs to fetch.
        page: 1-based page number the URLs came from; only used to derive
            the starting file index.
        save_dir: Directory the files are written to. It is created when it
            does not exist and there is at least one image to save.
    """
    if imgList and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    first_index = (page - 1) * 10
    for index, imgurl in enumerate(imgList, first_index):
        print('download file ' + str(index) + ' start')
        # urllib.urlretrieve is the Python 2 API for saving a URL to a file.
        urllib.urlretrieve(imgurl, '%s/%s.jpg' % (save_dir, index))
        print('download file ' + str(index) + ' end')
def getPicNum(number=10, page=1):
    """Download about ``number`` pictures, starting at listing page ``page``.

    Each listing page is counted as 10 pictures, so ``number // 10``
    consecutive pages are processed; ``number`` is best given as a multiple
    of 10 (any remainder is ignored).

    Args:
        number: How many pictures to fetch.
        page: 1-based listing page to start from.
    """
    pageSize = 10
    pageCount = number // pageSize
    # Walk pageCount pages beginning at the requested start page. Comparing
    # the page number against pageCount itself would skip everything
    # whenever the start page is larger than the page count.
    for current in range(page, page + pageCount):
        url = "http://www.meizitu.com/a/list_1_" + str(current) + ".html"
        html = getHtml(url)
        print('get %s html success' % current)
        imageList = getImg(html)
        print('get %s imageurl success' % current)
        download(imageList, current)
# Fetch 20 pictures beginning at listing page 1; pass a different page
# argument to start from another page.
getPicNum(20, 1)
# Wait for the user to press Enter before the script exits.
raw_input("press enter")
# Snippet source: http://www.codesnippet.cn/detail/2512201411395.html