- #!/usr/bin/env python2
- # -*- coding: utf-8 -*-
- """
- # @brief: 抓取dribbble.com里的图片,包括附件。
- # 主要用于平时在设计方面的材料收集。
- """
- from BeautifulSoup import BeautifulSoup
- from colorama import init, Fore #控制台颜色
- from urllib2 import urlopen
- from urllib import urlretrieve
- from progressbar import *
- import codecs
- import time
- import json
- import os
- import re
- # 通过使用autoreset参数可以让变色效果只对当前输出起作用,输出完成后颜色恢复默认设置
- init(autoreset=True)
class Dribbble(object):
    """Download shots (and their attachments) listed on dribbble.com.

    Instantiating the class does all the work: it loads the URL cache from
    ``dribbble.json`` (only trusted when it was written today), crawls the
    listing pages under ``HOMEPAGE`` until it meets a shot that is already
    cached, saves the extended cache, and downloads every newly found shot
    into the ``dribbble`` directory.

    Attributes:
        TOP: maximum number of listing pages to crawl; 0 means no limit.
        allPageList: every known shot URL (cached + new), oldest first.
        pageList: shot URLs discovered by this run, oldest first.
        hasUpdate: True once this run has discovered at least one new shot.
    """

    HOMEPAGE = 'http://dribbble.com/shots/everyone'
    TOP = 0  # 0 means "no page limit"
    # Class-level defaults are kept for backward compatibility only;
    # __init__ gives every instance its own lists so state is never shared.
    allPageList = []
    pageList = []
    hasUpdate = False

    def __init__(self, top=0):
        """Crawl, update the cache and download the new shots.

        Args:
            top: maximum number of listing pages to crawl (0 = unlimited).
        """
        self.TOP = top
        self.allPageList = []
        self.pageList = []
        self.hasUpdate = False
        now = time.strftime("%Y%m%d", time.localtime())

        # The local cache is only used when it was written today.
        cached = self.cacheShots()
        if isinstance(cached, dict) and cached.get('update') == now:
            self.allPageList = list(cached.get('url') or [])

        self.getAllShotsUrl()

        # Persist the cache only when something new was found.
        if self.hasUpdate:
            self.allPageList.extend(self.pageList)
            self.cacheShots({
                'url': self.allPageList,
                'update': now
            })

        for item in self.pageList:
            try:
                self.downShot(item)
            except IOError as err:
                # The URL is already recorded in the cache, so one broken
                # download must not abort the remaining ones.
                print(Fore.RED + 'Download failed for %s: %s' % (item, err))

    def getAllShotsUrl(self):
        """Collect the URLs of shots that are not in the cache yet.

        Listing pages are read from page 1 on (newest shots first). The crawl
        stops at the first shot already present in ``allPageList``, at a page
        that yields nothing new, or after ``TOP`` pages when ``TOP`` is set.
        New URLs are stored in ``pageList`` oldest first and ``hasUpdate`` is
        set when at least one was found.
        """
        pageUrl = self.HOMEPAGE + '?page='
        cached = set(self.allPageList)
        collected = set(self.pageList)
        pageIndex = 1
        while not self.TOP or pageIndex <= self.TOP:
            fresh = []
            reachedCache = False
            for href in self.getShotUrl(pageUrl + str(pageIndex)):
                # Cached entries are absolute URLs, so compare like with like.
                shotUrl = 'http://dribbble.com' + href
                if shotUrl in cached:
                    reachedCache = True
                    break
                # The same shot can show up on two consecutive pages when the
                # feed moves while we crawl; keep only its first occurrence.
                if shotUrl not in collected:
                    collected.add(shotUrl)
                    fresh.append(shotUrl)
            if fresh:
                self.hasUpdate = True
                # Pages list newest first; store oldest first, ahead of the
                # (newer) shots gathered from the previous pages.
                fresh.reverse()
                self.pageList = fresh + self.pageList
            if reachedCache or not fresh:
                break
            pageIndex += 1

    def _fetchSoup(self, url):
        """Fetch *url* and return it parsed by BeautifulSoup."""
        page = urlopen(url)
        try:
            return BeautifulSoup(page.read())
        finally:
            page.close()

    def getShotUrl(self, url):
        """Return the hrefs of all shot links found on the listing page *url*.

        Returns an empty list when the page has no
        ``<ol class="dribbbles group">`` element.
        """
        soup = self._fetchSoup(url)
        ol = soup.find('ol', {'class': 'dribbbles group'})
        if ol is None:
            return []
        links = ol.findAll('a', {'class': 'dribbble-link'})
        return [link['href'] for link in links if link.get('href')]

    @staticmethod
    def _uniquePath(dirname, filename):
        """Return a path in *dirname* for *filename* that is not an existing file.

        On a collision a counter is inserted before the extension
        (``name.png`` -> ``name0.png`` -> ``name1.png`` ...).
        """
        path = os.path.join(dirname, filename)
        base, ext = os.path.splitext(filename)
        counter = 0
        while os.path.isfile(path):
            path = os.path.join(dirname, '%s%d%s' % (base, counter, ext))
            counter += 1
        return path

    def _retrieve(self, url, path, filename=None):
        """Download *url* to *path*, showing a progress bar on the console."""
        if filename:
            print('\n' + Fore.CYAN + filename)
        widgets = ['Progress: ', Percentage(), ' ',
                   Bar(marker=RotatingMarker('>-=')), ' ',
                   ETA(), ' ', FileTransferSpeed()]
        # One bar per file; a list is used as a mutable flag because the
        # file targets Python 2, which has no ``nonlocal``.
        pbar = ProgressBar(widgets=widgets)
        started = [False]

        def dlProgress(blockCount, blockSize, totalSize):
            if totalSize <= 0:
                # Size unknown (urlretrieve reports -1): nothing to display.
                return
            if not started[0]:
                pbar.maxval = totalSize
                pbar.start()
                started[0] = True
            pbar.update(int(min(blockCount * blockSize, totalSize)))

        urlretrieve(url, path, dlProgress)
        if started[0]:
            pbar.finish()

    def downShot(self, url):
        """Download the images of the shot page *url* into ``dribbble/``.

        When the page lists attachments, those are downloaded; otherwise the
        preview image with its ``_1x`` suffix removed is downloaded. A page
        without a preview image is reported and skipped.
        """
        dirname = 'dribbble'
        try:
            os.mkdir(dirname)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(dirname):
                raise

        soup = self._fetchSoup(url)

        # Preview image, e.g.
        # //dribbble.s3.amazonaws.com/users/34934/screenshots/1268076/gym_preview_1x.jpg
        holder = soup.find('div', {'class': 'single-img'})
        image = holder.find('img') if holder is not None else None
        src = image.get('src') if image is not None else None
        if not src:
            print(Fore.YELLOW + 'Skipped, no preview image found: ' + url)
            return
        # Only protocol-relative sources need a scheme.
        shotDefaultUrl = 'http:' + src if src.startswith('//') else src
        shot2XOriginUrl = shotDefaultUrl.replace('_1x', '')
        # Everything up to and including the last slash.
        preUrl = shotDefaultUrl[:shotDefaultUrl.rfind('/') + 1]

        # Full-size attachments are what we really want.
        hasAttachmentShot = False
        shotAttachments = soup.find('div', {'class': 'attachments'})
        links = shotAttachments.findAll('a') if shotAttachments is not None else []
        for link in links:
            fileIds = re.findall(r'\d+$', link.get('href') or '')
            filename = link.string
            if not fileIds or not filename:
                continue
            # The name comes from remote HTML: never let it leave ``dirname``.
            localName = os.path.basename(filename)
            if not localName:
                continue
            attachmentUrl = preUrl + 'attachments/' + fileIds[0] + '/' + filename
            self._retrieve(attachmentUrl,
                           self._uniquePath(dirname, localName), filename)
            hasAttachmentShot = True

        if not hasAttachmentShot:
            match = re.search(r'[^/\\]+$', shot2XOriginUrl)
            if match is None:
                print(Fore.YELLOW + 'Skipped, no file name in: ' + shot2XOriginUrl)
                return
            filename = match.group()
            self._retrieve(shot2XOriginUrl,
                           self._uniquePath(dirname, filename), filename)

    def cacheShots(self, data=0):
        """Write or read the ``dribbble.json`` cache.

        Args:
            data: when truthy, the object to serialise into the cache file;
                when falsy, the cache file is read instead.

        Returns:
            *data* after writing, the decoded JSON content after reading, or
            False when the cache file is missing or cannot be decoded.
        """
        if data:
            # ensure_ascii=False may yield unicode text, so encode explicitly.
            with codecs.open('dribbble.json', 'w', 'utf-8') as f:
                f.write(json.dumps(data, indent=2, ensure_ascii=False))
            return data
        try:
            with codecs.open('dribbble.json', 'r', 'utf-8') as f:
                return json.loads(f.read())
        except (IOError, ValueError):
            return False
def main():
    """Run one crawl-and-download pass and report the elapsed time."""
    startTime = time.time()
    print(Fore.CYAN + 'Downloading...' + ' \n')
    Dribbble()
    print('\n' + Fore.GREEN + 'Download OK!')
    elapsed = time.time() - startTime
    # Elapsed seconds, shown with two decimals.
    print(u' '.join([
        Fore.YELLOW + u'共耗时:',
        Fore.YELLOW + '%.2f' % elapsed,
        Fore.YELLOW + u'秒',
    ]))


if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/191120137263.html
# Source: http://www.codesnippet.cn/detail/191120137263.html