# My own take on a script that downloads pictures from a photo-gallery site.

 
#!/usr/bin/env python
# -*- coding:utf-8 -*- 
 
import urllib2, urllib;
from threading import Thread;
import re, os;
 
'''
Python is a marvelous tool.
'''
class Spider(Thread):
    """Worker thread that fetches one page and downloads the images it embeds.

    The class targets Python 2 (it relies on ``urllib2`` and
    ``urllib.urlretrieve``); its statements are written so that they also
    parse under Python 3.

    Attributes:
        url: address of the page this spider fetches.
        output: directory that downloaded images are written to.
        content: raw page body, or "" until a fetch succeeds.
        urls: link targets found in the page, in document order.
        imgs: image sources found in the page, in document order.
    """

    # Each pattern captures the double-quoted attribute value of a tag.
    # Single-quoted literals avoid any need to escape the inner quotes.
    _LINK_PATTERN = '<a href="(.*?)".*?>'
    _IMAGE_PATTERN = '<img src="(.*?)".*?>'

    def __init__(self, url, output = "img/"):
        """Create a spider for `url` that saves images into `output`."""
        Thread.__init__(self)
        self.url = url
        self.output = output
        self.content = ""
        self.urls = []
        self.imgs = []
        # Set once the page has been fetched and parsed, so that calling
        # getThreads() and then start() does not fetch the page twice or
        # record every link and image twice.
        self._page_loaded = False

    def run(self):
        """Load the page (if not loaded yet) and download every image found.

        A failed download is reported and skipped so that the remaining
        images are still fetched.
        """
        self._load()
        if not os.path.isdir(self.output):
            print("the output file isn't found")
            return
        for img in self.imgs:
            print("download: %s" % img)
            # os.path.join also works when `output` has no trailing slash.
            filename = os.path.join(self.output, img.split("/")[-1])
            try:
                urllib.urlretrieve(img, filename)
            except IOError as error:
                print("can't download the image: [%s] (%s)" % (img, error))

    def getUrls(self):
        """Append the links found in `self.content` to `self.urls`."""
        self._collect(self._LINK_PATTERN, self.urls)

    def getImgs(self):
        """Append the image sources found in `self.content` to `self.imgs`."""
        self._collect(self._IMAGE_PATTERN, self.imgs)

    def getContent(self):
        """Fetch `self.url` (5 second timeout) into `self.content`.

        On failure a message is printed and `self.content` is left as it was.
        """
        try:
            handler = urllib2.urlopen(self.url, None, 5)
            try:
                self.content = handler.read()
            finally:
                # Release the connection even when read() fails.
                handler.close()
        except (IOError, ValueError):
            # In Python 2, urllib2.URLError and socket timeouts are IOError
            # subclasses; ValueError is raised for a URL without a scheme.
            print("can't find the url: [%s]" % self.url)

    def getThreads(self):
        """Return this spider followed by one new spider per linked page.

        The page is loaded first if necessary. Child spiders write to the
        same output directory as this one. None of the returned threads has
        been started by this method.
        """
        self._load()
        threads = [self]
        threads.extend(Spider(url, self.output) for url in self.urls)
        return threads

    def _load(self):
        """Fetch and parse the page once; later calls do nothing."""
        if self._page_loaded:
            return
        self.getContent()
        self.getUrls()
        self.getImgs()
        self._page_loaded = True

    def _collect(self, pattern, found):
        """Append new matches of `pattern` in the page to the list `found`.

        Only values containing "http:" exactly once are kept, which skips
        relative links (and also https links). Values already present in
        `found` are not added again, and document order is preserved.
        """
        seen = set(found)
        for value in re.findall(pattern, self.content):
            if value.count("http:") == 1 and value not in seen:
                seen.add(value)
                found.append(value)
 
 
def download(url):
    """Start a crawler thread for `url` and for every page it links to.

    The threads come from `Spider(url).getThreads()`; each one is announced
    on stdout and then started. The function returns without waiting for
    the threads to finish.
    """
    for worker in Spider(url).getThreads():
        print("begin thread: %s " % worker.url)
        worker.start()
 
if __name__ == "__main__":
    # Script entry point: start crawling from the hard-coded front page.
    download("http://www.22mm.cc/")
# This snippet comes from http://www.codesnippet.cn/detail/251020136688.html
# Source: http://www.codesnippet.cn/detail/251020136688.html
# Articles related to this post:
#
# None yet - be the first to comment!