#! /usr/bin/python
# -*- coding:utf-8 -*-
"""Dump the area links of the bj.58.com shared-rental (hezu) page to data.txt.

The script fetches ``__INITURL__``, collects the ``<a href=...>`` links found
under ``section > div.relative > dl.secitem`` and, for every link except the
first one, writes:

    1. the link text on its own line, then
    2. the texts of all ``<a>`` tags inside ``div.subarea`` of the page the
       link points to, joined by single spaces and followed by a blank line.

This is Python 2 code (``reload``, ``sys.setdefaultencoding`` and
``urllib.urlopen`` do not exist under these names in Python 3).
"""
import urllib
import os
import datetime
import sys

from bs4 import BeautifulSoup

# Python 2 workaround: make implicit unicode -> str conversions (such as the
# f.write() calls below, which receive unicode text from BeautifulSoup) use
# UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding("utf-8")

__BASEURL__ = "http://bj.58.com/"
__INITURL__ = "http://bj.58.com/hezu/"

soup = BeautifulSoup(urllib.urlopen(__INITURL__))
# Calling a Tag like a function is BeautifulSoup shorthand for find_all().
lv1Elements = soup.html.body.section.find('div', 'relative').find('dl', 'secitem')('a', href=True)

# The context manager guarantees data.txt is closed even when one of the
# per-link fetches or lookups below raises.
with open('data.txt', 'w') as f:
    # The first link is skipped on purpose by the original code -- presumably
    # it is a catch-all entry rather than a real area; TODO confirm.
    for element in lv1Elements[1:]:
        f.write(element.get_text() + '\r\n')
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement, so the console output is unchanged.
        print(element.get_text())

        # NOTE(review): the href is appended verbatim to the base URL; if the
        # site emits hrefs with a leading "/" this yields a double slash.
        url = __BASEURL__ + element.get('href')
        print(url)

        soup = BeautifulSoup(urllib.urlopen(url))
        lv2Elements = soup.html.body.section.find('div', 'relative').find('dl', 'secitem').find('div', 'subarea').find_all('a')
        texts = [t.get_text() for t in lv2Elements]
        f.write(' '.join(texts) + '\r\n\r\n')
# This snippet comes from http://www.codesnippet.cn/detail/200520133449.html
# Source: http://www.codesnippet.cn/detail/200520133449.html