# Scrape images from Dribbble

 
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
 
"""
# @brief: Scrape images from dribbble.com, including attachments.
#         Mainly used for collecting design reference material.
 
"""
from BeautifulSoup import BeautifulSoup
from colorama import init, Fore #控制台颜色
from urllib2 import urlopen
from urllib import urlretrieve
from progressbar import *
import codecs
import time
import json
import os
import re
 
# With autoreset=True a colour change only affects the current output;
# the default colour is restored once that output has been written.
init(autoreset=True)
 
class Dribbble:
  """Collect shot URLs from the dribbble.com listing and download them.

  Creating an instance runs the whole job: load the local cache, walk the
  listing pages for shots that are not cached yet, store the updated cache
  and download every newly found shot (its attachments when it has any,
  otherwise the full-size preview image).
  """
  HOMEPAGE = 'http://dribbble.com/shots/everyone'
  TOP = 0  # maximum number of listing pages to scan; 0 means no limit
  allPageList = []   # every known shot URL (cached + new), oldest first
  pageList = []      # shot URLs found during this run, oldest first
  hasUpdate = False  # True once this run has found at least one new shot

  def __init__(self, top=0):
    """Run a complete scrape.

    top -- maximum number of listing pages to scan (0 = unlimited).
    """
    self.TOP = top
    # Give each instance its own lists: the class-level defaults are shared
    # objects and the extend() below would otherwise mutate them.
    self.allPageList = []
    self.pageList = []
    self.hasUpdate = False

    now = time.strftime("%Y%m%d", time.localtime())
    # Load the local cache; it is only reused when it was written today.
    cached = self.cacheShots()
    if isinstance(cached, dict) and cached.get('update') == now:
      urls = cached.get('url')
      if isinstance(urls, list):
        self.allPageList = list(urls)

    self.getAllShotsUrl()
    # Persist the cache only when something new was found.
    # NOTE: the cache is written before downloading, so a shot whose
    # download fails is not retried by a rerun on the same day.
    if self.hasUpdate:
      self.allPageList.extend(self.pageList)
      self.cacheShots({
        'url': self.allPageList,
        'update': now
      })

    for item in self.pageList:
      if item:
        self.downShot(item)

  def getAllShotsUrl(self):
    """Fill self.pageList with the absolute URLs of shots not seen before.

    Listing pages are read from page 1 onwards (newest shots first) until
    one of these happens: the newest already-known URL is reached, a page
    yields no shots, or self.TOP pages have been scanned. New URLs are
    stored oldest first, and self.hasUpdate is set when any were found.
    """
    site = 'http://dribbble.com'
    pageUrl = self.HOMEPAGE + '?page='
    pageIndex = 1

    # The last entry of either list is the newest shot known so far. The
    # stored URLs are absolute, so the comparison below is done on absolute
    # URLs as well.
    if self.pageList:
      lastUpdateShotUrl = self.pageList[-1]
    elif self.allPageList:
      lastUpdateShotUrl = self.allPageList[-1]
    else:
      lastUpdateShotUrl = ''

    # The page limit is checked before fetching, so no request is wasted on
    # a page that would be discarded anyway.
    while not self.TOP or pageIndex <= self.TOP:
      shotsUrl = self.getShotUrl(pageUrl + str(pageIndex))
      if not shotsUrl:
        break

      fresh = []
      reachedKnown = False
      for url in shotsUrl:
        absolute = site + url
        if absolute == lastUpdateShotUrl:
          # Everything from here on has been collected already.
          reachedKnown = True
          break
        fresh.append(absolute)

      if fresh:
        self.hasUpdate = True
        # The page lists newest first; keep the stored order oldest first
        # and put this (older) page in front of the pages read before it.
        fresh.reverse()
        self.pageList = fresh + self.pageList

      if reachedKnown:
        break
      pageIndex += 1

  def getShotUrl(self, url):
    """Return the href of every shot link found on the listing page url.

    Returns an empty list when the page has no shot list.
    """
    page = urlopen(url)
    try:
      soup = BeautifulSoup(page.read())
    finally:
      page.close()

    ol = soup.find('ol', {'class': 'dribbbles group'})
    # No shot list on the page: nothing to return.
    if ol is None:
      return []

    result = []
    for link in ol.findAll('a', {'class': 'dribbble-link'}):
      try:
        href = link['href']
      except KeyError:
        # Anchor without an href attribute.
        continue
      if href:
        result.append(href)

    return result

  def downShot(self, url):
    """Download the images of the shot page url into the 'dribbble' folder.

    All attachments are downloaded when the shot has any; otherwise the
    full-size version of the preview image is downloaded. A shot page
    without a preview image is reported and skipped.
    """
    # Create the target directory on first use.
    dirname = 'dribbble'
    if not os.path.isdir(dirname):
      os.mkdir(dirname)

    page = urlopen(url)
    try:
      soup = BeautifulSoup(page.read())
    finally:
      page.close()

    # The regular preview image, for example
    # //dribbble.s3.amazonaws.com/users/34934/screenshots/1268076/gym_preview_1x.jpg
    holder = soup.find('div', {'class': 'single-img'})
    img = holder.find('img') if holder is not None else None
    src = img.get('src') if img is not None else None
    if not src:
      print(Fore.RED + 'Skipped (no image found): ' + url)
      return

    # Only a protocol-relative src needs the scheme added.
    if src.startswith('//'):
      shotDefaultUrl = 'http:' + src
    else:
      shotDefaultUrl = src
    # Dropping the "_1x" suffix gives the URL of the full-size image.
    shot2XOriginUrl = shotDefaultUrl.replace('_1x', '')
    # Everything up to and including the last slash.
    preUrl = shotDefaultUrl[:shotDefaultUrl.rfind('/') + 1]

    # Attachments are the large originals, i.e. what we really want.
    shotAttachments = soup.find('div', {'class': 'attachments'})
    shotAttachmentsList = []
    if shotAttachments is not None:
      shotAttachmentsList = shotAttachments.findAll('a')

    hasAttachmentShot = False
    for item in shotAttachmentsList:
      filename = item.string
      # The attachment id is the run of digits that ends the link's href.
      fileIds = re.findall(r'\d+$', item.get('href') or '')
      if not filename or not fileIds:
        continue

      attachmentUrl = preUrl + 'attachments/' + fileIds[0] + '/' + filename
      self._retrieveImg(attachmentUrl, self._freePath(dirname, filename), filename)
      hasAttachmentShot = True

    if not hasAttachmentShot:
      # The file name is the last path segment of the image URL.
      names = re.findall(r'[^/\\]+$', shot2XOriginUrl)
      if not names:
        print(Fore.RED + 'Skipped (no file name in URL): ' + shot2XOriginUrl)
        return
      self._retrieveImg(shot2XOriginUrl, self._freePath(dirname, names[0]), names[0])

  def _freePath(self, dirname, filename):
    """Return a path inside dirname for filename that is not taken yet.

    When the plain name already exists a counter is inserted in front of
    the extension: name0.ext, name1.ext, ...
    """
    # The name comes from a remote page: keep only its last component so it
    # cannot point outside dirname.
    safeName = os.path.basename(filename) or 'unnamed'
    path = os.path.join(dirname, safeName)
    if not os.path.isfile(path):
      return path

    stem, ext = os.path.splitext(safeName)
    counter = 0
    while True:
      path = os.path.join(dirname, '%s%d%s' % (stem, counter, ext))
      if not os.path.isfile(path):
        return path
      counter += 1

  def _retrieveImg(self, url, path, filename=None):
    """Download url to path, drawing a progress bar on the console."""
    if filename:
      print('\n' + Fore.CYAN + filename)

    # A fresh bar (and fresh widgets) per download, so every file gets its
    # own start/finish cycle.
    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker('>-=')), ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets)
    started = []  # non-empty once pbar.start() has been called

    def dlProgress(blockCount, blockSize, totalSize):
      # Without a known total size there is nothing to scale the bar to.
      if totalSize <= 0:
        return
      if not started:
        pbar.maxval = totalSize
        pbar.start()
        started.append(True)
      # The last block may overshoot the real size; clamp to the total.
      pbar.update(int(min(blockCount * blockSize, totalSize)))

    urlretrieve(url, path, dlProgress)

    if started:
      pbar.finish()

  def cacheShots(self, data=0):
    """Write or read the local cache file dribbble.json.

    With a truthy data the value is written as JSON and returned.
    Without it the cache is read and the decoded value is returned, or
    False when the file is missing, unreadable or not valid JSON.
    """
    if data:
      with codecs.open('dribbble.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))
      return data

    try:
      with codecs.open('dribbble.json', 'r', encoding='utf-8') as f:
        return json.load(f)
    except (IOError, ValueError):
      return False
 
if __name__ == '__main__':
  startTime = time.time()
  print Fore.CYAN + 'Downloading...', '\\n'
 
  Dribbble()
 
  print '\\n' + Fore.GREEN + 'Download OK!'
  endTime = time.time()
  # 秒数精确到小数点后两位
  print Fore.YELLOW + '共耗时:'.decode('utf-8'), Fore.YELLOW + '%.2f' %(endTime - startTime), Fore.YELLOW + '秒'.decode('utf-8')
# This snippet comes from http://www.codesnippet.cn/detail/191120137263.html
# Source: http://www.codesnippet.cn/detail/191120137263.html
# Related articles
#
# None yet - be the first to comment!