- #!/usr/bin/python
- ##########################################
- # Platform: Windows 8.1 64Bit #
- # Language: Python #
- # Version: 2.7 #
- ##########################################
- import httplib
- import re
- import sys
# Pager text: '"page">' followed by CJK characters, whitespace, the record
# count and (group 2) the page count.  The \u4e00-\u9fa5 range is resolved by
# the unicode literal itself, because Python 2's re module has no \uXXXX
# escape of its own; \s and \d are left for the regex engine.
_TOTAL_PAGE_RE = re.compile(u'"page">[\u4e00-\u9fa5]+\\s(\\d+).*?(\\d+)')

# One table row: <th>, <td>, <th>, two ignored <th> cells, then a final <th>.
# Single backslashes in a raw string -- a doubled "\\s" would only ever match
# a literal backslash followed by "s".
_ROW_RE = re.compile(
    r'<tr>\s+<th>(.*?)</th>\s+<td>(.*?)</td>\s+<th>(.*?)</th>'
    r'\s+<th>.*?</th>\s+<th>.*?</th>\s+<th>(.*?)</th>')
_TAG_RE = re.compile(r'<.*?>')
_HREF_RE = re.compile(r'href="(.*?)"', re.I)
# Characters that must not reach the file system from scraped text.
_UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _fetch_page(conn, path, headers):
    """GET *path* over *conn* and return ``(status, text)``.

    The body is always read, even for error statuses: httplib refuses to
    send another request on the connection until the previous response has
    been consumed.  Undecodable bytes are replaced rather than aborting the
    whole crawl.
    """
    conn.request('GET', path, '', headers)
    response = conn.getresponse()
    raw = response.read()
    return response.status, raw.decode('utf8', 'replace')


def _parse_total_pages(text):
    """Return the page count announced by the pager, or 1 if none is found."""
    match = _TOTAL_PAGE_RE.search(text)
    return int(match.group(2)) if match else 1


def _safe_filename(group):
    """Turn a scraped group label into a file name without path characters."""
    return _UNSAFE_FILENAME_RE.sub(u'_', group) or u'_'


def _format_record(host, first_col, link_cell, rank):
    """Build one ``first|blog url|name|Rank:rank`` output line (unicode)."""
    name = _TAG_RE.sub(u'', link_cell)
    href = _HREF_RE.search(link_cell)
    # A cell without a link yields an empty URL field instead of a crash.
    blog = u"http://%s%s" % (host, href.group(1)) if href else u""
    return u"%s|%s|%s|Rank:%s\n" % (first_col, blog, name, rank)


def _crawl_pages(conn, host, url, headers, total_pages, first_text):
    """Scrape pages 1..total_pages and write each row to its group's file.

    *first_text* is the already downloaded body of page 1, so that page is
    not requested a second time.  Rows are grouped by their third column;
    each group is written to ``<group>.txt`` in the current directory.

    Returns a dict mapping group label -> number of rows written.
    """
    outputs = {}        # file name -> open file object
    member_counts = {}  # group label -> row count
    try:
        for page in range(1, total_pages + 1):
            path = url + str(page)
            if page == 1:
                status, text = 200, first_text
            else:
                status, text = _fetch_page(conn, path, headers)
            print("status: %s http://%s%s" % (status, host, path))
            if status != 200:
                continue
            for first_col, link_cell, group, rank in _ROW_RE.findall(text):
                filename = _safe_filename(group) + u'.txt'
                fp = outputs.get(filename)
                if fp is None:
                    fp = outputs[filename] = open(filename, 'w')
                member_counts[group] = member_counts.get(group, 0) + 1
                # Encode explicitly so the write does not depend on the
                # interpreter's default encoding.
                record = _format_record(host, first_col, link_cell, rank)
                fp.write(record.encode('utf8'))
    finally:
        # Close (and thereby flush) every file even if a page failed midway.
        for fp in outputs.values():
            fp.close()
    return member_counts


def sendHttp(host, url):
    """Crawl a paginated listing and save its rows grouped into text files.

    Pages are fetched from ``http://<host><url><page number>`` on port 80,
    starting at page 1, whose pager text supplies the total page count.
    Every table row found is appended to ``<group>.txt`` (group = the row's
    third column) in the current directory, and a per-group / total count
    summary is printed at the end.

    Parameters:
        host -- server host name, e.g. 'www.wooyun.org'
        url  -- path prefix to which the page number is appended

    Returns None.  If page 1 does not answer with status 200, a failure
    message is printed and nothing is written.  Network errors raised by
    httplib propagate to the caller after the connection has been closed.
    """
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain;charset=utf-8",
    }
    conn = httplib.HTTPConnection(host, 80, strict=True, timeout=10)
    try:
        status, first_text = _fetch_page(conn, url + "1", headers)
        if status != 200:
            print("Url open failed,status code %s" % status)
            return
        total_pages = _parse_total_pages(first_text)
        print("total page: %s" % total_pages)
        print("start runing... ")
        member_counts = _crawl_pages(
            conn, host, url, headers, total_pages, first_text)
    finally:
        conn.close()

    print("File successfully saved. ")
    # Member count summary, sorted so the output order is deterministic.
    print("================================================= ")
    for group, count in sorted(member_counts.items()):
        print(u"%s: %s" % (group, count))
    print("total member: %s" % sum(member_counts.values()))
    print("================================================= ")
if __name__ == "__main__":
    # Python 2 only: make UTF-8 the default codec for implicit str/unicode
    # conversions.  reload(sys) is needed because setdefaultencoding is
    # removed from the sys module during interpreter start-up.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Crawl the listing served under http://www.wooyun.org/whitehats/page/<n>.
    sendHttp('www.wooyun.org', "/whitehats/page/")
# This snippet comes from http://www.codesnippet.cn/detail/261120137493.html
# Source: http://www.codesnippet.cn/detail/261120137493.html