A small exercise in concurrent crawling.
Paste the code into a local file, save it with a .py extension, and it can be run directly; the command-line argument is the user whose blog you want to crawl, defaulting to this blog.
The output is a directory named after the user, containing the blog content.
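For example, assuming the script is saved as cnblogs_spider.py (the file name is arbitrary), crawling the user kirai looks like this:

python cnblogs_spider.py kirai

This creates ./kirai/ and fills it with one <post-id>.html file per article. Note that the code targets Python 2 (it uses the Queue module and urllib.urlopen) and needs the third-party pyquery and threadpool packages.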
This is intended only as a study of multithreaded programming in Python; I will later rewrite it as a parallel crawler.
The crawler code is as follows:
# -*- coding:utf-8 -*-
from multiprocessing.managers import BaseManager
from pyquery import PyQuery
import os, sys, urllib, re
import Queue, threading, multiprocessing, threadpool

USER_NAME = 'kirai'
TOTAL_PAGE_NUMBER = 0
INT_REGEXP = re.compile(r'(\d+)')
BASE_URL = 'http://www.cnblogs.com/' + USER_NAME + '/p/?page='
ARTICLE_REGEXP = re.compile(r'href="(http://www.cnblogs.com/' + USER_NAME + r'/p/\d+\.html)"')
THREAD_NUMBER = multiprocessing.cpu_count() * 2
ARTICLE_URLS_MUTEX = threading.Lock()
ARTICLE_URLS = []
PAGE_URLS = []

# list.extend() returns None; returning self instead lets extend()
# be chained inside the reduce() call in __main__.
class ListWithLinkExtend(list):
    def extend(self, value):
        super(ListWithLinkExtend, self).extend(value)
        return self

# manager class for the planned distributed version (see cluster_process)
class KiraiManager(BaseManager):
    pass

def get_total_page_number():
    doc = PyQuery(url=BASE_URL)
    return int(INT_REGEXP.findall(
        doc.find('.pager .Pager').text())[0].encode('ascii'))

def get_page_url():
    return map(lambda page: BASE_URL + str(page),
               range(1, TOTAL_PAGE_NUMBER + 1))

def get_article_url(idx):
    url = PAGE_URLS[idx]
    doc = PyQuery(url=url)
    article_urls = ARTICLE_REGEXP.findall(str(doc.find('.PostList .postTitl2')))
    return article_urls

# callback for phase 1: append one page's list of article urls under the lock
def handle_result(request, result):
    with ARTICLE_URLS_MUTEX:
        ARTICLE_URLS.append(result)

# unused stub for the planned multiprocess/distributed version
def cluster_process():
    task_queue = Queue.Queue()    # list : urls
    result_queue = Queue.Queue()  # str : path
    KiraiManager.register('get_task_queue', callable=lambda: task_queue)
    KiraiManager.register('get_result_queue', callable=lambda: result_queue)
    manager = KiraiManager(address=('', 6969), authkey='whosyourdaddy')
    manager.start()
    manager.shutdown()
    # article_flag, article_urls = get_article_url()

# a simple way: fetch an article page and return (html, post id)
def get_article(url):
    html = urllib.urlopen(url).read()
    return html, INT_REGEXP.findall(url)[0]

# callback for phase 2: write one article to ./USER_NAME/<post id>.html
def save_article(request, result):
    content, file_name = result
    path = './' + USER_NAME + '/' + file_name + '.html'
    with open(path, 'w') as fp:
        fp.writelines(content)

def thread_process():
    if not os.path.exists(USER_NAME):
        os.mkdir(USER_NAME)
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(get_article, ARTICLE_URLS, save_article)
    [thread_pool.putRequest(req) for req in requests]
    thread_pool.wait()

def __main__(argv):
    global ARTICLE_URLS, TOTAL_PAGE_NUMBER, USER_NAME, BASE_URL, \
        ARTICLE_REGEXP, PAGE_URLS
    if len(argv) == 2:
        USER_NAME = argv[1]
        BASE_URL = 'http://www.cnblogs.com/' + USER_NAME + '/p/?page='
        ARTICLE_REGEXP = re.compile(r'href="(http://www.cnblogs.com/' + USER_NAME + r'/p/\d+\.html)"')
    TOTAL_PAGE_NUMBER = get_total_page_number()
    PAGE_URLS = get_page_url()
    # phase 1: one task per post-list page, collecting article urls
    thread_pool = threadpool.ThreadPool(THREAD_NUMBER)
    requests = threadpool.makeRequests(
        get_article_url,
        range(TOTAL_PAGE_NUMBER),
        handle_result)
    [thread_pool.putRequest(req) for req in requests]
    thread_pool.wait()
    # flatten the per-page url lists into one flat list
    ARTICLE_URLS = list(reduce(lambda a, b: ListWithLinkExtend(a).extend(ListWithLinkExtend(b)),
                               ARTICLE_URLS))
    # phase 2: one task per article, downloading and saving it
    thread_process()

if __name__ == '__main__':
    __main__(sys.argv)
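The part worth studying is the threadpool request/callback workflow, which the crawler runs twice: once over the post-list pages to collect article URLs (callback handle_result), and once over the article URLs to download and save them (callback save_article). Below is a minimal, self-contained sketch of that workflow; fetch() and the sample inputs are made up for illustration:

# -*- coding:utf-8 -*-
import threadpool

def fetch(n):
    # worker: runs in a pool thread; its return value goes to the callback
    return n * n

def on_result(request, result):
    # callback: runs when a worker finishes; request.args holds the input
    print 'task %s -> %s' % (request.args[0], result)

pool = threadpool.ThreadPool(4)  # 4 worker threads
reqs = threadpool.makeRequests(fetch, [1, 2, 3, 4], on_result)
[pool.putRequest(req) for req in reqs]  # enqueue every task
pool.wait()  # block until all tasks have finished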
A brief explanation of the global variables:
USER_NAME: the user whose blog will be crawled; defaults to kirai.
TOTAL_PAGE_NUMBER: updated at runtime to the total number of post-list pages of the blog.
INT_REGEXP: a regex for matching integers.
BASE_URL: the initial URL of the post-list pages.
ARTICLE_REGEXP: a regex that extracts the article page URLs from each pyquery-processed post-list page.
THREAD_NUMBER: the number of threads; by default twice the machine's CPU core count.
ARTICLE_URLS_MUTEX: the lock on ARTICLE_URLS, guaranteeing that only one thread modifies it at a time.
ARTICLE_URLS: holds all article URLs; each worker appends one page's list of URLs, and the nested result is flattened afterwards (see the sketch below).
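The flattening relies on a small trick worth calling out: the built-in list.extend() returns None, so it cannot be chained inside reduce(); ListWithLinkExtend overrides it to return self. A minimal sketch of the same idiom with made-up sample data (wrapping the second argument, as the crawler does, is harmless but not required):

pages = [['url1', 'url2'], ['url3'], ['url4', 'url5']]

class ListWithLinkExtend(list):
    def extend(self, value):
        super(ListWithLinkExtend, self).extend(value)
        return self  # returning self makes extend() chainable

# Python 2: reduce is a builtin
flat = list(reduce(lambda a, b: ListWithLinkExtend(a).extend(b), pages))
print flat  # ['url1', 'url2', 'url3', 'url4', 'url5']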