#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import configparser
import logging
import logging.handlers
import os
import os.path
import queue
import random
import re
import threading
import time

import lxml.etree as etree
import requests
# Request headers used for douban.com HTML pages (group and topic pages).
# The User-Agent is a desktop Chrome string and Host is pinned to
# www.douban.com, so these headers must not be reused for other hosts.
DOUBAN_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.douban.com/search?cat=1019&q=%E5%AE%B3%E7%BE%9E',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.douban.com',
    'Connection': 'Keep-Alive',
}

# Request headers used for image downloads; no Host/Referer is sent because
# image URLs are taken verbatim from <img src> and may point at another host.
IMAGE_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36',
}

CNFG_FILE = 'douban_crawler.cfg'   # configuration file name
LOG_FILE = 'douban_crawler.log'    # rotating log file name
MAX_LOG_SIZE = 1024 * 1024         # rotate the log once it reaches 1 MB
LOG_BACKUP_COUNT = 3               # number of rotated log files to keep
# Module-wide logger: everything (DEBUG and up) goes to a size-rotated UTF-8
# log file, while only INFO and up is echoed to the console stream.
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)

fh = logging.handlers.RotatingFileHandler(
    LOG_FILE,
    maxBytes=MAX_LOG_SIZE,
    backupCount=LOG_BACKUP_COUNT,
    encoding='utf-8',
)
fh.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

# add the handlers to logger
logger.addHandler(fh)
logger.addHandler(ch)

# Short aliases used throughout the module instead of logger.<level>(...).
DEBUG = logger.debug
INFO = logger.info
WARNING = logger.warning
ERROR = logger.error
class Parser_Douban_Group(threading.Thread):
    """Producer thread: fetch one group page and queue every topic row found.

    Each queued item is a dict with the keys ``title``, ``url``, ``auth``,
    ``reply`` and ``time``. When the page has been processed (or processing
    failed) an empty dict is queued as the end-of-stream marker.
    """

    def __init__(self, url, queue, t_name = 'Parser Group'):
        """Store the group page *url*, the output *queue* and the thread name."""
        threading.Thread.__init__(self, name=t_name)
        self.data = queue
        self.url = url
        self.s = requests.Session()

    def run(self):
        """Download and parse the group page, then emit the end marker."""
        INFO("{0} started!".format(self.name))
        co = 0  # number of topic rows queued
        try:
            htm = open_douban_page(self.url, self.s)
            try:
                parser = etree.HTMLParser(recover=True)
                text_dom = etree.fromstring(htm, parser)
            except Exception as e:
                ERROR('Parse douban page error: {0}'.format(e))
                text_dom = None

            # Defensive: only extract when a root element actually came back.
            if text_dom is not None:
                group_name = ''.join(text_dom.xpath("//div[@id='group-info']/h1//text()")).strip()
                INFO('Group name: {0}'.format(group_name))
                for row in text_dom.xpath("//tr[@class='']"):
                    co = co + 1
                    # The time cell gets its own local name so the ``time``
                    # module is not shadowed inside this method.
                    post_time = ''.join(row.xpath("child::td[@class='time']//text()"))
                    item = {
                        'title': ''.join(row.xpath("child::td[@class='title']/a//text()")),
                        'url': ''.join(row.xpath("child::td[@class='title']/a/attribute::href")),
                        'auth': ''.join(row.xpath("child::td[@nowrap='nowrap']/a[@class='']//text()")),
                        'reply': ''.join(row.xpath("child::td[@class='']//text()")),
                        'time': post_time,
                    }
                    # Push the rows to the queue in page order.
                    self.data.put(item, block=True)
                    DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, co, item['title'][:20]))
        finally:
            # Always emit the end marker, even if extraction raised, so the
            # consumer does not sit in its 5-minute queue wait for nothing.
            self.data.put({})
        INFO("{0} finished! put {1} topic to queue.".format(self.name, co))
class Parser_Douban_Topic(threading.Thread):
    """Middle stage: turn queued topics into queued image URLs.

    Reads topic dicts (``title``/``url`` keys are used) from ``topic_queue``,
    downloads each topic page and puts one ``{'title', 'url'}`` dict per
    ``<img>`` of the first ``topic-content`` div onto ``content_queue``.
    An empty dict on ``topic_queue`` means "no more topics"; it is put back
    for any sibling consumer, and an empty dict is put on ``content_queue``
    when this thread stops.
    """

    def __init__(self, topic_queue, content_queue, t_name = 'Parser Topic'):
        """Store the input/output queues and create a private HTTP session."""
        threading.Thread.__init__(self, name=t_name)
        self.topic_queue = topic_queue
        self.content_queue = content_queue
        self.s = requests.Session()

    def _extract_images(self, topic):
        """Return ``(topic_name, image_urls)`` for one topic dict.

        Returns ``('', [])`` when the page cannot be parsed or has no
        ``topic-content`` div.
        """
        htm = open_douban_page(topic['url'], self.s)
        try:
            parser = etree.HTMLParser(recover=True)
            text_dom = etree.fromstring(htm, parser)
        except Exception as e:
            ERROR('Parse douban page error: {0}'.format(e))
            return '', []
        if text_dom is None:
            return '', []

        # Strip real newlines from the heading text (the previous pattern
        # matched a literal backslash followed by "n", which never occurs).
        topic_name = ''.join(text_dom.xpath("//div[@id='content']/h1//text()")).replace('\n', '').strip()
        DEBUG('Topic name: {0}'.format(topic_name))

        div_node = text_dom.xpath("//div[@class='topic-content']")
        if not div_node:
            ERROR('No topic content found: {0}'.format(topic['url']))
            return topic_name, []
        return topic_name, div_node[0].xpath("descendant::img/attribute::src")

    def run(self):
        """Consume topics until the end marker arrives or the queue times out."""
        INFO("{0} started!".format(self.name))
        co = 0   # topics taken from topic_queue
        coo = 0  # image items put on content_queue
        try:
            while True:
                try:
                    # Block on the queue for at most 5 minutes.
                    val = self.topic_queue.get(True, 300)
                except queue.Empty as e:
                    ERROR("{0} timeout! {1}".format(self.name, e))
                    break

                if not val:
                    # End marker: hand it back for other consumers and stop.
                    self.topic_queue.put({})
                    INFO("{0} finished! get {1} topic from queue.".format(self.name, co))
                    break

                co = co + 1
                try:
                    DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
                    topic_name, img_list = self._extract_images(val)
                except Exception as e:
                    # One bad topic must not end the thread and drop the
                    # remaining topics; log it and move on.
                    ERROR('{0} skipped topic({1}): {2}'.format(self.name, co, e))
                    continue

                for src in img_list:
                    coo = coo + 1
                    item = {'title': topic_name + str(coo), 'url': src}
                    # Push the image entries to the queue in document order.
                    self.content_queue.put(item)
                    DEBUG('{0} Put({1}) - ({2} ...)'.format(self.name, coo, item['title'][:20]))
        finally:
            # Emit the end marker for the storage threads.
            self.content_queue.put({})
        INFO("{0} finished! put {1} image to queue.".format(self.name, coo))
class Save_Douban_Group(threading.Thread):
    """Final stage: download queued image URLs into a local folder.

    Items are dicts with ``title`` and ``url`` keys. The file name is the
    last ``/``-separated component of the URL; files that already exist are
    not downloaded again. An empty dict is the end marker and is put back on
    the queue so sibling storage threads also see it.
    """

    def __init__(self, queue, folder_name = 'image', t_name = 'Storage'):
        """Store the input *queue*, target folder and thread name."""
        threading.Thread.__init__(self, name=t_name)
        self.data = queue
        self.folder = folder_name
        self.s = requests.Session()

    def _save_image(self, item):
        """Download ``item['url']`` into the folder; return True if written."""
        img_nm = item['url'].split('/')[-1]
        if not img_nm:
            return False  # URL ends with "/": there is no file name to use
        fn = os.path.join(self.folder, img_nm)
        # Check before downloading so known images cost no network traffic.
        if os.path.exists(fn):
            return False
        img_dt = open_douban_page(item['url'], self.s, ret_raw = True)
        if not img_dt:
            return False
        os.makedirs(self.folder, exist_ok=True)
        with open(fn, 'wb') as fp:
            fp.write(img_dt)
        return True

    def run(self):
        """Consume image items until the end marker arrives or the queue times out."""
        INFO("{0} started!".format(self.name))
        co = 0   # items taken from the queue
        coo = 0  # files actually written
        while True:
            try:
                # Block on the queue for at most 5 minutes.
                val = self.data.get(True, 300)
            except queue.Empty as e:
                # Nothing arrived in time: stop, otherwise join() on this
                # thread could never return.
                ERROR("{0} timeout! {1}".format(self.name, e))
                break

            if not val:
                self.data.put({})  # keep the end marker for the other savers
                break

            co = co + 1
            try:
                DEBUG('{0} Get({1}) - ({2} ...)'.format(self.name, co, val['title'][:20]))
                if self._save_image(val):
                    coo = coo + 1
            except Exception as e:
                # A single failed download/write is logged and skipped.
                ERROR("{0} save image({1}) error: {2}".format(self.name, co, e))
        INFO("{0} finished! save image({1}/{2}).".format(self.name, coo, co))
def open_douban_page(group_url, s, retries=3, ret_raw = False, timeout=30):
    """Fetch *group_url* through session *s* and return its body.

    Args:
        group_url: URL to request.
        s: a ``requests.Session`` (or compatible object with ``get``).
        retries: how many attempts are made when the connection fails; at
            least one attempt is always made.
        ret_raw: when true, request with ``IMAGE_HEADERS`` and return the
            body as ``bytes``; otherwise request with ``DOUBAN_HEADERS`` plus
            the ``bid`` cookie and return the decoded text.
        timeout: seconds passed to ``Session.get`` so a stalled server cannot
            block the calling thread forever.

    Returns:
        The response body, or ``''`` when every attempt failed.
    """
    cookies = dict(bid="RmFNKKPAd0s")
    while True:
        try:
            if ret_raw:
                r = s.get(group_url, headers=IMAGE_HEADERS, timeout=timeout)
            else:
                r = s.get(group_url, headers=DOUBAN_HEADERS, cookies=cookies,
                          timeout=timeout)
            r.raise_for_status()
            # Random pause after each successful request to pace the crawl.
            time.sleep(random.uniform(0.3, 1.5))
        except requests.ConnectionError as e:
            ERROR('Connect douban error({0}): {1}'.format(retries, e))
            retries = retries - 1
            if retries <= 0:
                return ''
            # Retry in the same mode: the loop keeps ret_raw, so an image
            # retry still uses IMAGE_HEADERS and still yields bytes.
            time.sleep(0.5)
        except Exception as e:
            ERROR('Open douban url({0}) error: {1}'.format(group_url, e))
            return ''
        else:
            DEBUG('Request url: {0}'.format(group_url))
            # .content is the fully read, content-decoded body; the body is
            # downloaded inside the try above, so read errors are handled.
            return r.content if ret_raw else r.text
def crawler_douban(group_url, folder_name, task_name):
    """Run one full crawl of *group_url* and block until it is finished.

    Pipeline: one group parser fills the topic queue, one topic parser turns
    topics into image URLs on the content queue, and two storage threads
    download those images into *folder_name*. *task_name* prefixes the group
    parser's thread name.
    """
    q_topic = queue.Queue()
    q_content = queue.Queue()

    parser_group_obj = [
        Parser_Douban_Group(group_url, q_topic, '{0} {1}'.format(task_name, i))
        for i in range(1, 2)
    ]
    parser_topic_obj = [
        Parser_Douban_Topic(q_topic, q_content, 'Parser Topic {0}'.format(i))
        for i in range(1, 2)
    ]
    storage_pic_obj = [
        Save_Douban_Group(q_content, folder_name, 'Storage {0}'.format(i))
        for i in range(1, 3)
    ]

    # Start, then join, in pipeline order: producers first, savers last.
    workers = parser_group_obj + parser_topic_obj + storage_pic_obj
    for obj in workers:
        obj.start()
    for obj in workers:
        obj.join()
if __name__ == '__main__':
    # Script entry point: crawl the same group over and over, pausing two
    # seconds before each round, until the user interrupts with Ctrl-C.
    haixiu_hangzhou_url = 'http://www.douban.com/group/505137/'  # alternative target, currently unused
    haixiu_url = 'http://www.douban.com/group/haixiuzu/'
    co = 0  # round counter, embedded in the group parser's thread name
    try:
        while True:
            co = co + 1
            time.sleep(2.0)
            crawler_douban(haixiu_url, 'image', 'Parser HaiXiu Group ({0})'.format(co))
    except KeyboardInterrupt:
        # The loop has no other exit; catching the interrupt is what makes
        # the prompt below reachable.
        INFO('Crawler interrupted after {0} round(s).'.format(co))
    input('Press any key to exit!')
# This snippet comes from http://www.codesnippet.cn/detail/0104201512043.html
# Source: http://www.codesnippet.cn/detail/0104201512043.html