- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
import re
import time
import urllib.error
import urllib.request

import conf as cf
# Landing page of the exploitdb release archive on GitHub.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'
# Captures the path portion of each ".zip" download link on a release page.
# NOTE(review): this and the two patterns below have no space before
# rel="nofollow" — possibly lost when this listing was scraped from the web;
# verify against the actual page HTML before relying on them.
DOWNLOAD_LINK_PATTERN = 'href="(.*?)zip"rel="nofollow">'
# Captures the URL of the "Next" pagination link on the first page.
FIRST_PATTERN = r'</span><a rel="nofollow"href="(.*?)">Next.*'
# Captures the "Next" pagination link on every subsequent page.
PAGE_PATTERN = r'>Previous</a><a rel="nofollow"href="(.*?)">Next.*'
class MyCrawler:
    """Crawl the exploitdb GitHub release pages and record zip download links.

    Starting from ``base_url``, each page is scraped for ``.zip`` download
    links (appended to ``result.txt``) and the crawler then follows the
    "Next" pagination link until no further page exists.
    """

    def __init__(self, base_url=BASE_URL, start_page="first 1 page"):
        self.base_url = base_url      # URL of the page to fetch next
        self.start_page = start_page  # human-readable label of the current page

    def first_page(self):
        """Scrape the first releases page, then hand off to pagination."""
        try:
            req = urllib.request.Request(self.base_url)
            html = urllib.request.urlopen(req)
            # Bug fix: original read from undefined name ``HTML`` (NameError).
            doc = html.read().decode('utf8', 'ignore')
            next_page = re.search(FIRST_PATTERN, doc, re.M | re.I)
            print('Now working on page = {}\n'.format(self.start_page))
            time.sleep(5)  # be polite: pause 5 seconds between page fetches
            self.fetch_download_link(self.base_url)
            # Bug fix: guard against a missing "Next" link (re.search -> None).
            if next_page is not None:
                self.start_page = next_page.group(1)
                self.base_url = next_page.group(1)
        except urllib.error.HTTPError as err:
            print(err.msg)
        self.fetch_next_page()

    def fetch_next_page(self):
        """Follow "Next" links page by page until pagination or the server stops."""
        while True:
            try:
                req = urllib.request.Request(self.base_url)
                html = urllib.request.urlopen(req)
                doc = html.read().decode('utf8', 'ignore')
                next_page = re.search(PAGE_PATTERN, doc, re.M | re.I)
                print('Now working on page {}\n'.format(self.start_page))
                time.sleep(5)  # wait 5 seconds before turning the page
                self.fetch_download_link(self.base_url)
                # Bug fix: on the last page there is no "Next" link and
                # re.search returns None; the original crashed with
                # AttributeError on .group(1). Stop cleanly instead.
                if next_page is None:
                    break
                self.start_page = next_page.group(1)
                self.base_url = next_page.group(1)
            except urllib.error.HTTPError as err:
                print(err.msg)
                break

    def fetch_download_link(self, Aurl):
        """Scrape every zip download link on *Aurl* and append them to result.txt."""
        # Bug fix: use a context manager so the file handle is closed even
        # when urlopen/read raises (the original leaked it on error).
        with open('result.txt', 'a') as f:
            req = urllib.request.Request(Aurl)
            html = urllib.request.urlopen(req)
            doc = html.read().decode('utf8')
            # Deduplicate the matches before storing them.
            alist = list(set(re.findall(DOWNLOAD_LINK_PATTERN, doc)))
            for item in alist:
                url = "https://github.com/" + item + "zip"
                print('Storing {}'.format(url))
                f.write(url + '\n')
                time.sleep(7)  # throttle between recorded links

    def run(self):
        """Entry point: crawl from the first page onward.

        Bug fix: the original called ``self.fetch_download_link()`` without
        its required URL argument (TypeError on every invocation); the
        intended behavior is to start the full crawl.
        """
        self.first_page()
if __name__ == '__main__':
    # Script entry point: build a crawler and start from the first page.
    crawler = MyCrawler()
    crawler.first_page()
Source: http://www.bubuko.com/infodetail-3213843.html