#!/usr/bin/env python
# -*- coding: utf8 -*-
# Given an input URL, list the sites it depends on (those referenced in its HTML).
# The dependency file has the following format:
#   *.microsoft.com
#   *.outlook.com
#   *.apple.com
#   *.ibm.com
import contextlib
import re
import socket
import sys
import urllib2
import urlparse
def printHelp():
    """Print command-line usage for both invocation forms to stdout.

    Approach 1 takes a single URL, approach 2 takes the name of a file
    listing URLs. In both forms the optional ``dstfile`` argument names
    the output file.
    """
    # Every call passes exactly one string, so the parenthesised form
    # prints the same text under the Python 2 print statement and the
    # Python 3 print function.
    print('Approach 1: python DepSpy.py url dstfile')
    print(' * url starts with http:// or https://.')
    print(' * dstfile is the full name of output file,')
    print(' results output to stdout if dstfile is empty.')
    # The leading CR/LF separates the two usage sections with a blank line.
    print('\r\nApproach 2: python DepSpy.py urlfile dstfile')
    print(' * urlfile is the full name of file listing input urls(splitted by \\n).')
    print(' * dstfile is the full name of output file,')
    print(' results output to stdout if dstfile is empty.')
# Route the command line to the matching mode.
def dispatch(args):
    """Run the mode selected by the command line and return the hosts found.

    ``args`` is an argument vector laid out like ``sys.argv``: ``args[0]``
    is the script name and ``args[1]`` is a help flag, a URL starting with
    ``http://`` or ``https://``, or the name of a file listing URLs.

    Returns a list of unique host patterns in first-seen order. The list
    is empty when only the help text was printed or when an error occurred;
    errors are printed rather than raised.
    """
    help_flags = ('h', '/h', '-h', '?', '/?', '-?', 'help', '-help', '/help')
    try:
        if len(args) < 2:
            printHelp()
            return []
        target = args[1]
        if len(args) == 2 and target in help_flags:
            printHelp()
            # Return a list explicitly so callers can always iterate
            # over the result.
            return []
        if target.startswith(('http://', 'https://')):
            # The argument is a single URL.
            return getDependHost(target)
        # Otherwise the argument is the name of a URL list file.
        hosts = []
        seen = set()
        for u in readURLList(target):
            print('---- Dealing with: ' + u + ' ----')
            for host in getDependHost(u):
                # Merge the per-URL results, keeping first-seen order.
                if host not in seen:
                    seen.add(host)
                    hosts.append(host)
        return hosts
    except Exception as e:
        # Best-effort boundary: report the problem and return no results.
        print(e)
        return []
# Patterns used to find the sites a page depends on.
#
# _pattern captures the absolute http(s) URL held in a src/href attribute
# of a <script> or <link> tag. The attribute must sit inside the same tag
# ([^>]*? cannot cross a '>'), may be single- or double-quoted, and is
# matched case-insensitively. It has exactly one capture group, so
# findall() returns a flat list of URL strings.
_pattern = re.compile(
    r'<(?:script|link)\b[^>]*?\b(?:src|href)\s*=\s*'
    r'["\'](https?://[^"\'\s>]+)["\']',
    re.IGNORECASE)
# _pwww matches the leftmost label of a host name together with its dot
# (for example "www."), so that it can be replaced by the "*." wildcard.
_pwww = re.compile(r'^[a-z0-9_-]+\.')
def _wildcard_host(link):
    """Return the host pattern for the URL ``link``."""
    # Lower-case the host so the lower-case-only _pwww pattern applies and
    # hosts differing only in case are treated as the same entry.
    netloc = urlparse.urlparse(link).netloc.lower()
    if 'baidu.com' in netloc:
        # Baidu addresses are handled separately: keep the host as it is.
        return netloc
    if netloc.count('.') < 2:
        # Fewer than two dots: prefix the whole host with the wildcard.
        return '*.' + netloc
    # Otherwise replace the leftmost label with the wildcard.
    return _pwww.sub('*.', netloc)


def getDependHost(url):
    """Fetch ``url`` and return the host patterns its HTML refers to.

    ``url`` may omit the scheme, in which case ``http://`` is assumed.
    The page is downloaded, every URL found by ``_pattern`` is reduced to
    a host pattern, and the unique patterns other than the page's own are
    returned in first-seen order. The resulting list is also printed.

    Returns an empty list when anything fails; the error is printed
    rather than raised so that one bad URL does not stop a batch run.
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(url)) as resp:
            html = resp.read()
        own_host = _wildcard_host(url)
        hosts = []
        # Seeding the set with the page's own host excludes it from the
        # result without a separate comparison.
        seen = set([own_host])
        for link in _pattern.findall(html):
            host = _wildcard_host(link)
            if host not in seen:
                seen.add(host)
                hosts.append(host)
        print(hosts)
        return hosts
    except Exception as e:
        print(e)
        return []
# Read the list of URLs.
def readURLList(path):
    """Return the entries of the URL list file at ``path``.

    The file holds one entry per line; LF, CR and CRLF line endings are
    all accepted. Every ``*`` is replaced by ``www`` (so ``*.apple.com``
    becomes ``www.apple.com``), surrounding whitespace is removed and
    blank lines are skipped.

    Raises whatever ``open`` raises when the file cannot be opened.
    """
    with open(path, 'r') as fp:
        content = fp.read()
    urls = []
    for line in content.splitlines():
        entry = line.strip().replace('*', 'www')
        if entry:
            urls.append(entry)
    return urls
# Program entry point.
def main(argv):
    """Run the tool for the argument vector ``argv`` and emit the results.

    ``argv`` is laid out like ``sys.argv``. When ``argv[2]`` is present it
    names the output file and one host pattern is written per line;
    otherwise the patterns are printed to stdout.
    """
    socket.setdefaulttimeout(60)  # Global timeout for all sockets.
    # Guard against a None result so the loops below always get a list.
    hosts = dispatch(argv) or []
    if len(argv) > 2:
        try:
            # The with-block closes the file even when a write fails.
            with open(argv[2], 'w') as fp:
                for host in hosts:
                    fp.write(host + '\r\n')
        except (IOError, OSError) as e:
            print('Write File Error: %s' % e)
    else:
        for host in hosts:
            print(host)


if __name__ == '__main__':
    main(sys.argv)
# This snippet comes from http://www.codesnippet.cn/detail/1206201512850.html
# Source: http://www.codesnippet.cn/detail/1206201512850.html