- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
import os
import re
from threading import Thread

try:  # Python 2
    import urllib2
    import urllib
except ImportError:  # Python 3: urllib.request exposes urlopen, urlretrieve and URLError
    import urllib.request as urllib2
    urllib = urllib2
- '''
- python是个神器
- '''
class Spider(Thread):
    """Thread that fetches one page and downloads the images it references.

    After a crawl, ``urls`` holds the link targets and ``imgs`` the image
    sources found in ``content``. Both lists keep document order and never
    contain duplicates.

    Args:
        url: Address of the page to fetch.
        output: Existing directory the images are saved into.
    """

    # Only attributes that directly follow the tag name and use double
    # quotes are matched, exactly as the original patterns intended.
    _LINK_RE = re.compile(r'<a href="(.*?)".*?>')
    _IMG_RE = re.compile(r'<img src="(.*?)".*?>')

    def __init__(self, url, output="img/"):
        Thread.__init__(self)
        self.url = url
        self.output = output
        self.content = ""
        self.urls = []
        self.imgs = []
        # Set once the page has been fetched and parsed, so that run() does
        # not repeat the work getThreads() already did on the same spider.
        self._crawled = False

    def run(self):
        """Crawl the page if needed, then save every image into ``output``."""
        self._crawl()
        if not os.path.exists(self.output):
            print("the output file isn't found")
            return
        for img in self.imgs:
            print("download: %s" % img)
            # os.path.join also works when ``output`` has no trailing slash.
            filename = os.path.join(self.output, img.split("/")[-1])
            try:
                urllib.urlretrieve(img, filename, None)
            except IOError as error:
                # One broken image must not abort the remaining downloads.
                print("can't download the image: [%s] (%s)" % (img, error))

    def getUrls(self):
        """Append the link targets found in ``content`` to ``urls``."""
        self._collect(self._LINK_RE, self.urls)

    def getImgs(self):
        """Append the image sources found in ``content`` to ``imgs``."""
        self._collect(self._IMG_RE, self.imgs)

    def getContent(self):
        """Download ``url`` (5 second timeout) into ``content``.

        On failure a message is printed and ``content`` is left unchanged.
        """
        try:
            handler = urllib2.urlopen(self.url, None, 5)
            try:
                content = handler.read()
            finally:
                # Release the connection even when read() fails.
                handler.close()
        except IOError:
            # URLError derives from IOError, as do socket errors and
            # timeouts raised while reading the body.
            print("can't find the url: [%s]" % self.url)
            return
        if not isinstance(content, str):
            # Python 3 returns bytes; the str patterns above need text.
            content = content.decode("utf-8", "replace")
        self.content = content

    def getThreads(self):
        """Return this spider followed by one new spider per linked page.

        The new spiders share this spider's ``output`` directory. They are
        created but not started.
        """
        self._crawl()
        return [self] + [Spider(url, self.output) for url in self.urls]

    def _crawl(self):
        """Fetch and parse the page, at most once per spider."""
        if self._crawled:
            return
        self.getContent()
        self.getUrls()
        self.getImgs()
        self._crawled = True

    def _collect(self, pattern, target):
        """Append new matches of ``pattern`` in ``content`` to ``target``."""
        seen = set(target)
        for match in pattern.findall(self.content):
            # NOTE(review): the original filter is kept; it accepts strings
            # containing "http:" exactly once, so https links are skipped.
            if match.count("http:") == 1 and match not in seen:
                seen.add(match)
                target.append(match)
def download(url):
    """Start a spider for ``url`` and one for every page it links to.

    Each thread's URL is announced on stdout right before it is started.
    The threads are not joined here.
    """
    for worker in Spider(url).getThreads():
        print("begin thread: %s " % worker.url)
        worker.start()
if __name__ == "__main__":
    # Script entry point: crawl the hard-coded start page.
    download("http://www.22mm.cc/")
# This snippet comes from http://www.codesnippet.cn/detail/251020136688.html
# Source: http://www.codesnippet.cn/detail/251020136688.html