- #!/usr/bin/env python
- #-*- coding:utf-8 -*-
- """
- copy the UI of site specified by user
- """
- import sys
- reload(sys)
- sys.setdefaultencoding('utf-8')
- import os
- import os.path
- import re
- import errno
- from Queue import Queue
- import threading
- from urlparse import urlparse, urljoin
- import requests
def make_sure_path_exists(path):
    """Ensure the directory *path* exists, creating parents as needed."""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            return  # directory already there -- nothing to do
        raise
def write_to_file(filename, content, mode='w'):
    """Write *content* to *filename*, creating parent directories first.

    mode: passed straight to open() ('w' for text, 'wb' for binary).
    An IOError is reported and swallowed on purpose -- one unwritable
    file should not abort the whole crawl.
    """
    make_sure_path_exists(os.path.dirname(filename))
    try:
        with open(filename, mode) as f:
            f.write(content)
    except IOError:
        # FIX: parenthesized print works identically on Python 2 for a
        # single argument, and keeps the file parseable by Python 3.
        print('can not open file : %s' % filename)
def get_login_cookie(login_url, post_data):
    """POST the login form data to *login_url* and return the cookies."""
    response = requests.post(login_url, data=post_data)
    return response.cookies
class BaseHandler(object):
    """Common plumbing for page/asset handlers.

    Holds the URL the content was fetched from, the raw content, and
    the shared work queue that downloader threads consume.
    """

    def __init__(self, url, content, queue):
        self._url = url          # URL this content was fetched from
        self._content = content  # raw response body
        self._queue = queue      # shared (url, file_type, filename) queue

    def _URL_normalization(self, url):
        """Strip surrounding quotes/spaces and make *url* absolute."""
        # BUG FIX: the original strip argument was a malformed string
        # literal (unterminated); the intent is to remove surrounding
        # double quotes, single quotes and spaces.
        url = url.strip('"\' ')
        o = urlparse(url)
        if not o.netloc:
            # relative reference -- resolve against the page's own URL
            url = urljoin(self._url, url)
        return url

    def save(self, filename, mode='w'):
        """Persist the (possibly rewritten) content to *filename*."""
        write_to_file(filename, self._content, mode)
class CssHandler(BaseHandler):
    """Rewrites url(...) references inside a stylesheet and enqueues
    the referenced resources for download.

    BUG FIX: the class was defined as 'CSSHandler' but every use --
    including its own super() call and Downloader.run() -- says
    'CssHandler', which raised NameError; the definition now matches.
    """

    def __init__(self, url, content, queue):
        super(CssHandler, self).__init__(url, content, queue)

    def convert_path(self):
        """Replace each url(...) with a local ../{css,js,img}/ path and
        put the original absolute URL on the download queue."""
        content = self._content
        # BUG FIX: the original pattern r'url\\((.+?)\\)' demanded a
        # literal backslash before the parenthesis and never matched
        # real CSS; r'url\((.+?)\)' matches url(...) as intended.
        p = re.compile(r'url\((.+?)\)', re.I)
        ret = ''
        last_idx = 0
        for m in p.finditer(content):
            url = self._URL_normalization(m.group(1))
            # Keep the URL path under a css-url/ subtree so assets
            # referenced from CSS cannot collide with page assets.
            filename = 'css-url' + urlparse(url).path
            if filename.endswith('.css'):
                file_type = 'css'
                dir_prefix = '../css/'
            elif filename.endswith('.js'):
                file_type = 'js'
                dir_prefix = '../js/'
            else:
                file_type = 'img'
                dir_prefix = '../img/'
            self._queue.put((url, file_type, filename.replace('/', os.sep)))
            ret += content[last_idx:m.start(1)] + dir_prefix + filename
            last_idx = m.end(1)
        # BUG FIX: the original appended content[e:], which raised
        # NameError when the stylesheet contained no url(...) at all;
        # continue from the running cursor instead.
        ret += content[last_idx:]
        self._content = ret
class HtmlHandler(BaseHandler):
    """Rewrites asset references in an HTML page to the local
    static/{css,js,img}/ layout and enqueues each resource.

    BUG FIX: the class was defined as 'htmlHandler' but used as
    'HtmlHandler' (its own super() call and Downloader.run()), which
    raised NameError; the definition now matches the uses.
    """

    def __init__(self, url, content, queue):
        super(HtmlHandler, self).__init__(url, content, queue)

    def convert_path(self):
        """Rewrite <img src>, <link href> and <script src> URLs and put
        each referenced resource on the download queue."""
        html = self._content
        # BUG FIX: the original patterns matched 'src1="..."', an
        # attribute that never occurs in real HTML; match the standard
        # src/href attributes instead.
        img_regex = re.compile(r'<img[^>]*? src="(.+?)"', re.I)
        link_regex = re.compile(r'<link[^>]*? href="(.+?)"', re.I)
        js_regex = re.compile(r'<script[^>]*? src="(.+?)"', re.I)
        for regex in (img_regex, link_regex, js_regex):
            ret = ''
            last_idx = 0
            for m in regex.finditer(html):
                url = self._URL_normalization(m.group(1))
                filename = os.path.basename(urlparse(url).path)
                if filename.endswith('.css'):
                    file_type = 'css'
                    dir_prefix = 'static/css/'
                elif filename.endswith('.js'):
                    file_type = 'js'
                    dir_prefix = 'static/js/'
                else:
                    file_type = 'img'
                    dir_prefix = 'static/img/'
                self._queue.put((url, file_type, filename.replace('/', os.sep)))
                ret += html[last_idx:m.start(1)] + dir_prefix + filename
                last_idx = m.end(1)
            # BUG FIX: the original appended html[e:], which raised
            # NameError when a pattern had no matches (and would have
            # wiped the page); continue from the cursor instead.
            ret += html[last_idx:]
            html = ret
        self._content = html
class Downloader(threading.Thread):
    """Worker thread: pulls (url, file_type, filename) jobs off the
    shared queue, downloads each resource and dispatches it to the
    matching handler. Runs forever; started as a daemon by SiteCopy."""

    def __init__(self, t_name, queue, base_dir, js_dir,
                 css_dir, img_dir, cookies=''):
        super(Downloader, self).__init__(name=t_name)
        self._queue = queue        # shared work queue
        self._base_dir = base_dir  # where HTML pages are written
        self._js_dir = js_dir
        self._css_dir = css_dir
        self._img_dir = img_dir
        self._cookies = cookies    # forwarded to requests.get (login session)

    def run(self):
        """Consume the queue until the process exits."""
        while True:
            url, file_type, filename = self._queue.get()
            try:
                r = requests.get(url, cookies=self._cookies)
            except requests.exceptions.RequestException:
                print('download page error: %s' % url)
                self._queue.task_done()
                continue
            except Exception:
                # BUG FIX: was a bare 'except:', which also swallowed
                # SystemExit/KeyboardInterrupt inside the worker.
                print('other exception')
                self._queue.task_done()
                continue
            if file_type == 'html':
                h = HtmlHandler(url, r.content, self._queue)
                h.convert_path()
                h.save(os.path.join(self._base_dir, filename))
            elif file_type == 'js':
                write_to_file(os.path.join(self._js_dir, filename), r.content)
            elif file_type == 'css':
                c = CssHandler(url, r.content, self._queue)
                c.convert_path()
                c.save(os.path.join(self._css_dir, filename))
            elif file_type == 'img':
                # images are binary -- write with 'wb'
                write_to_file(os.path.join(self._img_dir, filename),
                              r.content, 'wb')
            else:
                # FIX: corrected 'unkonwn' typo in the log message
                print('unknown type')
            print('download complete: %s' % url)
            self._queue.task_done()
class SiteCopy(object):
    """Drives the copy: seeds the queue with the start page, spawns
    downloader threads, and blocks until the queue drains."""

    def __init__(self, thread_num, start_url, cookies=''):
        super(SiteCopy, self).__init__()
        self._thread_num = thread_num  # number of Downloader threads
        self._start_url = start_url    # page whose UI will be copied
        self._cookies = cookies        # optional login cookies
        self._queue = Queue()          # shared work queue

    def run(self):
        """Create the output layout, enqueue index.html and wait for
        every queued download to finish (queue.join)."""
        o = urlparse(self._start_url)
        # BUG FIX: the original replaced os.sep, which on Windows is
        # '\\' and would leave the URL's '/' separators intact; URL
        # paths always use '/', so replace that explicitly.
        base_dir = (o.netloc + o.path).replace('/', '-')
        js_dir = os.path.join(base_dir, 'static', 'js')
        css_dir = os.path.join(base_dir, 'static', 'css')
        img_dir = os.path.join(base_dir, 'static', 'img')
        self._queue.put((self._start_url, 'html', 'index.html'))
        for _ in range(self._thread_num):
            task = Downloader('Downloader', self._queue, base_dir, js_dir,
                              css_dir, img_dir, self._cookies)
            task.daemon = True  # do not keep the interpreter alive
            task.start()
        self._queue.join()
if __name__ == '__main__':
    # usage: <script> <start_url>
    if len(sys.argv) != 2:
        print('wrong arguments')
        sys.exit(-1)
    copier = SiteCopy(10, sys.argv[1])  # 10 downloader threads
    copier.run()
# This snippet originates from http://www.codesnippet.cn/detail/041120136891.html
# Source: http://www.codesnippet.cn/detail/041120136891.html