- #!/usr/bin/env python
- #coding=utf-8
- """
- Author: Anemone
- Filename: getmain.py
- Last modified: 2015-02-19 16:47
- E-mail: [email protected]
- """
- import urllib2
- from bs4 import BeautifulSoup
- import re
- import sys
- reload(sys)
- sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Fetch one article page and extract its metadata and body text.

    url -- absolute URL of a single 52duzhe.com article page.

    Returns a dict with keys "title", "writer", "from", "context".
    Raises whatever urllib2/bs4 raise on network or parse failure.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Fix: the original leaked the HTTP connection; always close it.
        response.close()
    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    # NOTE(review): the element with id "pub_date" is stored under "writer"
    # and "media_name" under "from" -- these look swapped/mislabeled;
    # confirm against the site's actual markup.
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()
    text = soup.get_text()
    # The page text is framed by two "BAIDU_CLB...;" ad-loader statements;
    # splitting on them leaves the article body at index 1.
    # (Raw string for the regex -- the original used a plain string.)
    parts = re.split(r"BAIDU_CLB.*;", text)
    return {"title": title, "writer": writer, "from": _from, "context": parts[1]}
- def getCatalog(issue):
- url="http://www.52duzhe.com/"+issue[:4]+"_"+issue[-2:]+"/"
- firstUrl=url+"duzh"+issue+"01.html"
- firstUrl=url+"index.html"
- duzhe=dict()
- response = urllib2.urlopen(firstUrl)
- html = response.read()
- soup=BeautifulSoup(html)
- firstUrl=url+soup.table.a.get("href")
- response = urllib2.urlopen(firstUrl)
- html = response.read()
- soup = BeautifulSoup(html)
- all=soup.find_all("h2")
- for i in all:
- print i.string
- duzhe[i.string]=list()
- for link in i.parent.find_all("a"):
- href=url+link.get("href")
- print href
- while 1:
- try:
- article=getEachArticle(href)
- break
- except:
- continue
- duzhe[i.string].append(article)
- return duzhe
- def readDuZhe(duzhe):
- for eachColumn in duzhe:
- for eachArticle in duzhe[eachColumn]:
- print eachArticle["title"]
if __name__ == '__main__':
    # issue = raw_input("issue(201501):")  # uncomment for interactive use
    catalog = getCatalog("201424")
    readDuZhe(catalog)
# Source: http://www.phpxs.com/code/1009377/