# A Python script that detects the third-party resources a web page depends on.

 
#!/usr/bin/env python
# -*- coding: utf8 -*-
  
# 通过输入的网址获取其依赖的站点（html中引用到的）
# 依赖文件格式如下：
# *.microsoft.com
# *.outlook.com
# *.apple.com
# *.ibm.com
  
import urllib2
import urlparse
import socket
import sys
import re
  
def printHelp():
    """Print command-line usage for both invocation modes to stdout.

    Mode 1 takes a single URL, mode 2 takes a file listing URLs; in both
    modes the optional second argument names the output file.
    """
    print('Approach 1: python DepSpy.py url dstfile')
    print('    * url starts with http:// or https://.')
    print('    * dstfile is the full name of output file,')
    print('      results output to stdout if dstfile is empty.')
    # The leading line break separates the two usage sections.
    print('\r\nApproach 2: python DepSpy.py urlfile dstfile')
    print('    * urlfile is the full name of file listing input urls(splitted by \\n).')
    print('    * dstfile is the full name of output file,')
    print('      results output to stdout if dstfile is empty.')
  
def dispatch(args):
    """Run the action selected by the command-line arguments.

    Args:
        args: the full argument vector (``sys.argv`` style); ``args[1]`` is
            a help flag, a URL starting with http:// or https://, or the
            path of a file listing URLs.

    Returns:
        The list of dependency hosts found, without duplicates and in
        first-seen order. An empty list is returned when help was printed
        or when any error occurred (the error is printed, not raised).
    """
    helpFlags = ('h', '/h', '-h', '?', '/?', '-?', 'help', '-help', '/help')
    try:
        if len(args) < 2 or (len(args) == 2 and args[1] in helpFlags):
            printHelp()
            return []

        if args[1].startswith(('http://', 'https://')):
            # The argument is a single URL.
            return getDependHost(args[1])

        # Otherwise the argument names a file listing URLs.
        ret = []
        seen = set()
        for u in readURLList(args[1]):
            if not u.strip():
                # Blank lines in the list file are not URLs; skip them.
                continue
            print('---- Dealing with: ' + u + ' ----')
            for it in getDependHost(u):
                if it not in seen:
                    seen.add(it)
                    ret.append(it)
        return ret
    except Exception as e:
        # Top-level boundary of the script: report the problem and fall
        # through to the empty result instead of crashing.
        print(e)

    return []
  
# Captures the absolute http(s) URL in a double-quoted src/href attribute of
# a <script> or <link> tag. "[^>]*?" keeps the match inside one tag so that
# several tags on the same line are all found and attributes of unrelated
# tags are not picked up.
_pattern = re.compile(
    r'<(?:script|link)[^>]*?(?:src|href)\s?=\s?"(https?://.+?)"')
# Matches the first label of a host name together with its trailing dot
# (e.g. "www.").
_pwww = re.compile(r'^[a-z0-9_-]+\.')


def _getHost(url):
    """Return the wildcard form of the host in url (e.g. '*.example.com')."""
    netloc = urlparse.urlparse(url).netloc.lower()

    if 'baidu.com' in netloc:
        # Baidu hosts are special-cased: they are reported verbatim.
        return netloc
    if netloc.count('.') < 2:
        # Bare "example.com" style host: just prepend the wildcard.
        return '*.' + netloc
    # Otherwise replace the first label ("www.", "cdn.", ...) by the wildcard.
    return _pwww.sub('*.', netloc, 1)


def _extractDependHosts(html, selfHost):
    """Return the unique wildcard hosts referenced by html, minus selfHost."""
    seen = set()
    seen.add(selfHost)
    hosts = []
    for dep in _pattern.findall(html):
        host = _getHost(dep)
        if host not in seen:
            seen.add(host)
            hosts.append(host)
    return hosts


def getDependHost(url):
    """Fetch url and list the external hosts its script/link tags refer to.

    Args:
        url: address of the page to inspect. 'http://' is prepended when
            the address has neither an http:// nor an https:// prefix.

    Returns:
        A list of wildcard host names (such as '*.example.com') in
        first-seen order, without duplicates and without the page's own
        host. The list is also printed. On any error the error is printed
        and an empty list is returned.
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        resp = urllib2.urlopen(url)
        try:
            html = resp.read()
        finally:
            # Release the connection even when reading fails.
            resp.close()

        ret = _extractDependHosts(html, _getHost(url))
        print(ret)
        return ret
    except Exception as e:
        # Best effort: a page that cannot be fetched or parsed contributes
        # no hosts instead of aborting the whole run.
        print(e)

    return []
  
def readURLList(path):
    """Read the URLs to inspect from a text file, one per line.

    Carriage returns are dropped and every '*' is replaced by 'www', so a
    wildcard entry such as '*.apple.com' becomes 'www.apple.com'. Blank
    lines are skipped and surrounding whitespace is removed.

    Args:
        path: name of the file listing the URLs.

    Returns:
        The list of non-empty entries in file order.
    """
    with open(path, 'r') as fp:
        text = fp.read()

    lines = text.replace('\r', '').replace('*', 'www').split('\n')
    return [line.strip() for line in lines if line.strip()]
  
def _writeResults(hosts, path):
    """Write each host on its own CRLF-terminated line to the file at path."""
    with open(path, 'w') as fp:
        for host in hosts:
            fp.write(host + '\r\n')


# Program entry point.
if __name__ == '__main__':
    socket.setdefaulttimeout(60)  # global timeout for all network access
    lst = dispatch(sys.argv)

    if len(sys.argv) > 2:
        # A second argument names the output file.
        try:
            _writeResults(lst, sys.argv[2])
        except Exception as e:
            print('Write File Error')
            print(e)
    else:
        # No output file given: print the results instead.
        try:
            for it in lst:
                print(it)
        except Exception as e:
            print('Error')
            print(e)
#该片段来自于http://www.codesnippet.cn/detail/1206201512850.html
# Source: http://www.codesnippet.cn/detail/1206201512850.html
# Related articles:
#
# None yet -- be the first to comment!