# -*- coding: utf-8 -*-
import contextlib
import cookielib
import os
import os.path
import Queue
import re
import socket
import sys
import threading
import time
import urllib
import urllib2

import MySQLdb
# --- Crawl targets -----------------------------------------------------------
# Site root that every other URL below is built from.
base = 'http://tieba.baidu.com'
# Forum index URL up to the "pn=" parameter; main() appends the offset.
url = ''.join((base, '/f?kw=%BD%E3%CD%D1&tp=0&pn='))
# Template for one thread: "%s" takes the thread path scraped from an index
# page, and a page number is appended after "pn=".
epLinkTemp = ''.join((base, '%s/?see_lz=1&pn='))
# Directory the downloaded images are written to.
store_path = "/home/pic/tieba/"

# --- Shared state ------------------------------------------------------------
# URL opener shared by all workers; stays None until setOpener() replaces it.
opener = None
# Work queues: mQ carries index page URLs, pQ carries thread URL prefixes and
# iQ carries image URLs.
mQ, pQ, iQ = Queue.Queue(), Queue.Queue(), Queue.Queue()

# Sockets created without an explicit timeout give up after 30 seconds.
socket.setdefaulttimeout(30)
class ThreadMainPage(threading.Thread):
    """Worker that turns forum index pages into thread URL prefixes.

    Each worker repeatedly takes an index page URL from ``mQ``, downloads it
    through the module-level ``opener``, extracts the thread links and puts
    one ``epLinkTemp``-formatted URL per thread on ``pQ``.
    """

    # Anchor markup of a thread title on an index page; group 1 is the href.
    # A raw, single-quoted literal lets the HTML's double quotes appear
    # verbatim, with no backslash escaping to get wrong.
    _EP_LINK_RE = re.compile(
        r'<a href="([^"]*?)" title="[^"]*?" target="_blank" class="j_th_tit">')

    def __init__(self, mQ, pQ):
        """Store the input queue ``mQ`` and the output queue ``pQ``."""
        threading.Thread.__init__(self)
        self.mQ = mQ
        self.pQ = pQ

    def getEPLink(self, host):
        """Return the thread URLs found on the index page at ``host``.

        Every href captured by ``_EP_LINK_RE`` is substituted into
        ``epLinkTemp``.  An empty list is returned when the download raises
        ``socket.timeout``; any other error propagates to the caller.
        """
        try:
            # closing() releases the connection even when read() fails.
            with contextlib.closing(opener.open(host, timeout=30)) as response:
                html = response.read()
        except socket.timeout:
            return []
        return [epLinkTemp % path for path in self._EP_LINK_RE.findall(html)]

    def run(self):
        """Process index pages from ``mQ`` in an endless loop.

        The loop never exits on its own: an empty queue is only reported,
        so the thread ends when the process does.
        """
        while True:
            try:
                host = self.mQ.get(True, 10)
            except Queue.Empty:
                print('mQ -----> Empty ')
                continue
            try:
                for link in self.getEPLink(host):
                    self.pQ.put(link)
            except Exception as err:
                # One bad page must not kill the worker, but report it
                # instead of dropping it silently.
                sys.stderr.write('mQ -----> failed on %s: %r\n' % (host, err))
            finally:
                # Exactly one task_done() per successful get(), whatever
                # happened in between, so that mQ.join() can return.
                self.mQ.task_done()
class ThreadEveryPage(threading.Thread):
    """Worker that turns thread URL prefixes into image URLs.

    Each worker repeatedly takes a thread URL prefix from ``pQ``, downloads
    the thread's pages through the module-level ``opener`` and puts every
    matching image URL on ``iQ``.
    """

    # Raw, single-quoted literals let the HTML's double quotes appear
    # verbatim.  The first "red" span is read as the number of pages;
    # group 1 of the image pattern is the image URL.
    _TOTAL_RE = re.compile(r'<span class="red">(.*?)</span>')
    _IMG_LINK_RE = re.compile(
        r'<img pic_type="0" class="BDE_Image" src="([^"]*?)" [^>]*?>')

    def __init__(self, pQ, iQ):
        """Store the input queue ``pQ`` and the output queue ``iQ``."""
        threading.Thread.__init__(self)
        self.pQ = pQ
        self.iQ = iQ

    @staticmethod
    def _fetch(link):
        """Download ``link`` and return its body, closing the connection."""
        with contextlib.closing(opener.open(link)) as response:
            return response.read()

    @classmethod
    def _pageCount(cls, html):
        """Return the page count found in ``html``, or 1 if there is none.

        Falling back to 1 means a page without a readable counter is still
        scanned for images instead of being dropped.
        """
        found = cls._TOTAL_RE.findall(html)
        try:
            return int(found[0])
        except (IndexError, ValueError):
            return 1

    def _enqueueImages(self, html):
        """Put every image URL matched in ``html`` on ``iQ``."""
        for link in self._IMG_LINK_RE.findall(html):
            self.iQ.put(link)

    def getImageLink(self, shortLink):
        """Queue the image URLs of every page of the thread at ``shortLink``.

        ``shortLink`` is fetched as given to learn the page count.  With
        more than one page, pages 1..total are fetched by appending the page
        number to ``shortLink``; otherwise the page already fetched is
        scanned.
        """
        html = self._fetch(shortLink)
        total = self._pageCount(html)
        if total > 1:
            # NOTE(review): page 1 is requested again here; presumably the
            # bare "pn=" request returned the same page - confirm before
            # skipping it.
            for page in range(1, total + 1):
                self._enqueueImages(self._fetch(shortLink + str(page)))
        else:
            self._enqueueImages(html)

    def run(self):
        """Process thread URL prefixes from ``pQ`` in an endless loop.

        The loop never exits on its own: an empty queue is only reported,
        so the thread ends when the process does.
        """
        while True:
            try:
                epLink = self.pQ.get(True, 10)
            except Queue.Empty:
                print('pQ -----> Empty ')
                continue
            try:
                self.getImageLink(epLink)
            except Exception as err:
                # One bad thread page must not kill the worker, but report
                # it instead of dropping it silently.
                sys.stderr.write('pQ -----> failed on %s: %r\n' % (epLink, err))
            finally:
                # Exactly one task_done() per successful get(), so that
                # pQ.join() can return.
                self.pQ.task_done()
class ThreadEveryImage(threading.Thread):
    """Worker that downloads the image URLs taken from ``iQ``.

    Files are stored in the module-level ``store_path`` directory under the
    last path component of their URL.
    """

    def __init__(self, iQ):
        """Store the input queue ``iQ``."""
        threading.Thread.__init__(self)
        self.iQ = iQ

    @staticmethod
    def _fileName(imageLink):
        """Return the part of ``imageLink`` after its last ``/``."""
        return imageLink.rsplit('/', 1)[-1]

    def getImage(self, imageLink):
        """Download ``imageLink`` into ``store_path`` unless already present.

        The data is first written to a ``.part`` file and renamed only after
        the download succeeded, so an interrupted transfer cannot leave a
        truncated image that later runs would mistake for a finished one.
        A failed download is reported and otherwise ignored.
        """
        image_path = store_path + self._fileName(imageLink)
        if os.path.exists(image_path):
            return
        part_path = image_path + '.part'
        try:
            urllib.urlretrieve(imageLink, part_path)
        except IOError:
            # Report the URL itself: that is what the message announces.
            print("Can't Download ImageURL:" + imageLink)
            if os.path.exists(part_path):
                os.remove(part_path)
            return
        os.rename(part_path, image_path)

    def run(self):
        """Process image URLs from ``iQ`` in an endless loop.

        The loop never exits on its own: an empty queue is only reported,
        so the thread ends when the process does.
        """
        while True:
            try:
                imageLink = self.iQ.get(True, 10)
            except Queue.Empty:
                print('iQ -----> Empty ')
                continue
            try:
                self.getImage(imageLink)
            except Exception as err:
                # One bad image must not kill the worker, but report it
                # instead of dropping it silently.
                sys.stderr.write(
                    'iQ -----> failed on %s: %r\n' % (imageLink, err))
            finally:
                # Exactly one task_done() per successful get(), so that
                # iQ.join() can return.
                self.iQ.task_done()
def setOpener():
    """Create the shared URL opener and store it in the global ``opener``.

    The opener keeps cookies in a fresh in-memory ``CookieJar`` and replaces
    the default request headers with a single Chrome-on-Mac ``User-agent``.
    """
    global opener

    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
        'ApplewebKit/535.1 (KHTML, like Gecko) '
        'Chrome/13.0.782.13 Safari/535.1'
    )
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())

    built = urllib2.build_opener(cookie_handler, urllib2.HTTPHandler)
    built.addheaders = [('User-agent', user_agent)]
    opener = built
def main():
    """Run the whole crawl and exit with status 0.

    Installs the shared opener, makes sure the download directory exists,
    starts 20 index-page workers, seeds ``mQ`` with ten index page URLs
    (offsets 0, 50, ..., 450), then starts 30 thread-page workers and
    30 image workers.  It then waits for ``mQ``, ``pQ`` and ``iQ`` to drain,
    in that order: each stage enqueues its output before marking its input
    done, so a drained queue means the next one has received all its work.
    """
    setOpener()

    # The image workers write straight into store_path and never create it;
    # without the directory every single download would fail.
    if not os.path.isdir(store_path):
        os.makedirs(store_path)

    def start_workers(count, make_worker):
        """Start ``count`` daemon threads built by ``make_worker``."""
        for _ in range(count):
            worker = make_worker()
            # Daemon threads do not keep the process alive after sys.exit().
            worker.setDaemon(True)
            worker.start()

    start_workers(20, lambda: ThreadMainPage(mQ, pQ))
    for offset in range(0, 500, 50):
        mQ.put(url + str(offset))
    start_workers(30, lambda: ThreadEveryPage(pQ, iQ))
    start_workers(30, lambda: ThreadEveryImage(iQ))

    mQ.join()
    pQ.join()
    iQ.join()
    sys.exit(0)
# Start the crawl only when this file is executed as a script, not when it
# is imported.
if "__main__" == __name__:
    main()
# This snippet comes from http://www.codesnippet.cn/detail/231020136596.html
# Source: http://www.codesnippet.cn/detail/231020136596.html