import contextlib
import re
import urllib
import urllib2

from bs4 import BeautifulSoup
# Page whose links are checked, and where the prettified HTML is dumped.
url = "http://zhiqq.com"
dump_path = 'C:/sou.txt'


def decode_page(raw, encoding='gb2312'):
    """Decode the raw bytes of a page into text.

    Bytes that are not valid in ``encoding`` are silently dropped
    (the 'ignore' error handler), so a partly corrupt page still decodes.

    Args:
        raw: page body as a byte string.
        encoding: codec the page is assumed to use; defaults to 'gb2312'.

    Returns:
        The decoded text.
    """
    return raw.decode(encoding, 'ignore')


def fetch_page(page_url):
    """Download ``page_url`` and return its body decoded by decode_page().

    The response is closed as soon as the body has been read.
    Any error raised by urllib2.urlopen() propagates to the caller.
    """
    with contextlib.closing(urllib2.urlopen(page_url)) as response:
        return decode_page(response.read())


def is_reachable(href):
    """Return True if ``href`` can be opened with urllib2, else False.

    Every ordinary failure counts as unreachable: a missing href (None),
    a relative or malformed URL, a network error, or an error raised by
    urlopen() for the response. KeyboardInterrupt and SystemExit are
    deliberately NOT caught, so the script can still be stopped with
    Ctrl-C while it is probing links.
    """
    try:
        # urlopen() itself raises on failure, so opening and closing the
        # response is enough; the status code does not need to be read.
        with contextlib.closing(urllib2.urlopen(href)):
            return True
    except Exception:
        return False


def main(page_url=url, out_path=dump_path):
    """Fetch a page, dump its prettified HTML, and probe every link.

    For each <a> tag the href and its zero-based index are printed,
    followed by "*******connect failed" when the link cannot be opened.

    Args:
        page_url: page to download and scan for links.
        out_path: file that receives the prettified HTML, UTF-8 encoded.
    """
    # Single-argument print(...) behaves the same as the print statement,
    # so the output is unchanged under Python 2.
    print('nihao')

    # Hand already-decoded text to the parser instead of re-encoding it to
    # UTF-8 bytes first. NOTE(review): this presumably also keeps the
    # parser from re-guessing the encoding from the page's own charset
    # declaration -- confirm against the Beautiful Soup documentation.
    soup = BeautifulSoup(fetch_page(page_url))

    # prettify() yields text; encode it explicitly and write bytes so that
    # non-ASCII pages do not fail on an implicit ASCII conversion, and let
    # the with-block close the file.
    with open(out_path, 'wb') as dump:
        dump.write(soup.prettify().encode('utf-8'))

    for index, link in enumerate(soup.find_all('a')):
        href = link.get('href')
        print(href)
        print(index)
        if not is_reachable(href):
            print("*******connect failed")


if __name__ == "__main__":
    main()
# Source: this snippet comes from http://www.codesnippet.cn/detail/060820134997.html