'''
Live-follower for a Tianya (bbs.tianya.cn) forum thread.
Python 3.4.3
'''
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Request headers mimicking a desktop Chrome browser so bbs.tianya.cn
# serves the regular HTML page instead of blocking the scraper.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'bbs.tianya.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36',
}
refushtime = 30  # default refresh interval, seconds
lastpage = 1     # newest page number seen so far
Furl = ''        # URL of the newest page
lzname = ''      # thread owner's (lou zhu's) user name
# Turl = 'http://bbs.tianya.cn/post-stocks-1131734-1.shtml'  # initial thread URL
Turl = 'http://bbs.tianya.cn/post-stocks-1345750-1.shtml'
def bsp(newurl):
    """Fetch *newurl* and return a parsed BeautifulSoup tree.

    Uses the module-level `header` dict so the forum accepts the request;
    a 10 s timeout keeps a stalled connection from hanging the poll loop.
    """
    html = requests.get(newurl, headers=header, timeout=10)
    # Naming the parser explicitly avoids bs4's "no parser specified"
    # warning and keeps parsing identical across machines.
    # NOTE(review): .decode() assumes the response is UTF-8 — confirm.
    soup = BeautifulSoup(html.content.decode(), 'html.parser')
    return soup
def pagnum(soup):
    """Return the thread's total page count.

    The page embeds a JavaScript literal such as ``pageCount : 12,`` inside
    the first ``<script type="text/javascript">`` tag; extract that number.
    """
    tx = soup.find('script', {'type': 'text/javascript'}).text
    # BUG FIX: the scraped pattern r'pageCount : \\d*,' matched a literal
    # backslash and could never match digits; \d* is the intended pattern.
    # A capture group replaces the brittle req[12:-1] slicing.
    match = re.search(r'pageCount : (\d*),', tx)
    return int(match.group(1))
def pagepro():
    """Rebuild the thread URL so it points at page `lastpage`.

    e.g. .../post-stocks-1345750-1.shtml -> .../post-stocks-1345750-<lastpage>.shtml
    Stores the result in the module-level `Furl` and returns it.
    """
    global Furl
    # Everything before the final '-' is the stable thread prefix; only the
    # trailing '<page>.shtml' segment changes between pages.
    base, _, _old_page = Turl.rpartition('-')
    Furl = '%s-%d.shtml' % (base, lastpage)
    return Furl
def pagecollect():
    """Scrape the newest page (`Furl`) and collect the thread owner's posts.

    Returns a flat list alternating ``[time, content, time, content, ...]``;
    an empty list when the owner has no posts on that page.
    """
    soup = bsp(Furl)
    txt = []
    # Post <div>s whose _host attribute equals the owner's name were
    # written by the thread starter (lzname).
    for post_div in soup.findAll('div', {'_host': lzname}):
        ntime = post_div.find('div', {'class': 'atl-info'}).text  # post timestamp block
        # Two-level lookup: outer content wrapper, then the actual body.
        content = post_div.find('div', {'class': 'atl-content'})
        content = content.find('div', {'class': 'bbs-content'}).text.strip()
        txt.append(ntime)
        txt.append(content)
    return txt
def formatprint(txt):
    """Pretty-print the ``[time, content, ...]`` pairs from pagecollect().

    Prints a placeholder line when there is nothing to show.
    """
    if not txt:
        print('===========None============')
        return
    for i in range(0, len(txt), 2):  # txt alternates [time, content, ...]
        print('=' * 30)
        print(txt[i])
        # BUG FIX: the scraped source printed a literal '\\n'; restore real
        # newlines when rewrapping the forum's 29-dash in-post ruler.
        print(txt[i + 1].replace('-' * 29, '\n----------\n'))
        print('=' * 30)
def main():
    """Print the thread title and owner, then poll forever for new posts."""
    # BUG FIX: lzname must be declared global too — pagecollect() reads the
    # module-level name, and without it the assignment below only created a
    # dead local while the global stayed ''.
    global refushtime, lastpage, Furl, Turl, lzname
    soup = bsp(Turl)
    title = re.sub('_.*', '=====', soup.title.text)
    print('=====', title)
    lastpage = pagnum(soup)
    print('LastPage:', lastpage)
    Furl = pagepro()  # compose the newest-page URL
    print('LastURL:', Furl)
    # The menu div carries the thread owner's user name as an attribute.
    lzname = soup.find('div', {'class': 'atl-menu clearfix js-bbs-act'})['js_activityusername']
    print('Lzname:', lzname)
    formatprint(pagecollect())  # first dump before entering the poll loop
    while True:
        time.sleep(refushtime)
        soup = bsp(Turl)  # re-fetch page 1 to see whether pages were added
        newpage = pagnum(soup)
        if newpage > lastpage:
            # A new page appeared: advance to it and print its posts.
            print('LastPage:', newpage)
            lastpage = newpage
            Furl = pagepro()
            formatprint(pagecollect())
        else:
            os.system('cls')  # Windows-only screen clear
            formatprint(pagecollect())
            print('==========Refush==========')
if __name__ == '__main__':
    # Script entry point. This duplicates main() inline: at module level
    # every assignment lands directly on the globals that pagecollect()
    # and pagepro() read, so no `global` declarations are needed.
    soup = bsp(Turl)
    title = re.sub('_.*', '=====', soup.title.text)
    print('=====', title)
    lastpage = pagnum(soup)
    print('LastPage:', lastpage)
    Furl = pagepro()  # compose the newest-page URL
    print('LastURL:', Furl)
    # The menu div carries the thread owner's user name as an attribute.
    lzname = soup.find('div', {'class': 'atl-menu clearfix js-bbs-act'})['js_activityusername']
    print('Lzname:', lzname)
    formatprint(pagecollect())  # first dump before entering the poll loop
    while True:
        time.sleep(refushtime)
        soup = bsp(Turl)  # re-fetch page 1 to see whether pages were added
        newpage = pagnum(soup)
        if newpage > lastpage:
            # A new page appeared: advance to it and print its posts.
            print('LastPage:', newpage)
            lastpage = newpage
            Furl = pagepro()
            formatprint(pagecollect())
        else:
            os.system('cls')  # Windows-only screen clear
            print('==========Refush==========')
            formatprint(pagecollect())
# Snippet originally from: http://www.codesnippet.cn/detail/0605201512500.html
# Source: http://www.codesnippet.cn/detail/0605201512500.html