import re
import urllib2
from contextlib import closing

import MySQLdb
class LatestTest(object):
    """Scrape post entries from the toutiao.io "latest" page and store them in MySQL.

    The class targets Python 2 (it relies on ``urllib2`` and ``MySQLdb``);
    the syntax used here is also accepted by Python 3.
    """

    # Captures, per post: the title link's href, the title link's text, and
    # the text of the first <span> inside the post's "meta" block.
    _POST_PATTERN = re.compile(
        r'<div class="post">.*?class="title">.*?href="(.*?)">(.*?)</a>.*?<div class="meta">.*?<span>(.*?)</span>',
        re.S,
    )

    # Parameterized statement: the driver quotes the values, so scraped text
    # containing quotes can neither break the statement nor inject SQL.
    _INSERT_SQL = "insert into content(url,title,email) values (%s,%s,%s)"

    def __init__(self):
        """Set the page URL and the browser-like request headers."""
        self.url = "https://toutiao.io/latest"
        self.UserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
        self.header = {'User-Agent': self.UserAgent}

    def getDate(self, html=None, limit=10):
        """Return the scraped fields of the first ``limit`` posts.

        Args:
            html: Page markup to parse. When ``None`` (the default) the page
                at ``self.url`` is downloaded with ``self.header``.
            limit: Maximum number of posts to return; ``None`` means no
                limit. Defaults to 10.

        Returns:
            A list of ``[url, title, meta]`` lists with surrounding
            whitespace stripped from every field. ``meta`` is the value
            that ``savaDateMysql`` stores in the ``email`` column.

        Raises:
            Whatever ``urllib2.urlopen`` raises when the download fails.
        """
        if html is None:
            request = urllib2.Request(self.url, headers=self.header)
            # Python 2 responses are not context managers; closing() makes
            # sure the socket is released even if read() fails.
            with closing(urllib2.urlopen(request)) as response:
                html = response.read()

        rows = []
        # finditer is lazy, so scanning stops as soon as the limit is hit.
        for match in self._POST_PATTERN.finditer(html):
            if limit is not None and len(rows) >= limit:
                break
            rows.append([field.strip() for field in match.groups()])
        return rows

    def savaDateMysql(self, url, title, email):
        """Insert one row into the ``content`` table (best effort).

        Any error is printed rather than raised, so the caller can carry on
        with the next row.

        The table must already exist; presumably it is created as
        ``content(id int AUTO_INCREMENT PRIMARY KEY, url varchar(100),
        title varchar(100), email varchar(100))`` -- TODO confirm.

        Args:
            url: Link of the post.
            title: Title of the post.
            email: Value stored in the ``email`` column.
        """
        # NOTE(review): host and credentials are hard-coded and the row is
        # written into the server's ``mysql`` schema; move these to
        # configuration and a dedicated database.
        conn = None  # stays None when connect() fails, see ``finally``
        try:
            # Positional arguments: host, user, password, database.
            conn = MySQLdb.connect('192.168.200.23', 'root', 'g6s8m3t7s', 'mysql', charset='utf8')
            cursor = conn.cursor()
            cursor.execute(self._INSERT_SQL, (url, title, email))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            # Only close a connection that was actually opened; otherwise a
            # NameError here would mask the real connection error.
            if conn is not None:
                conn.close()
def main():
    """Fetch the latest posts, echo each one, and store it in MySQL.

    A failure while fetching the page propagates to the caller; a failure
    while handling a single row is printed and the remaining rows are
    still processed.
    """
    scraper = LatestTest()
    # getDate() already strips every field, so no further cleanup is needed.
    for url, title, email in scraper.getDate():
        # Guard each row on its own so one bad row does not drop the rest.
        try:
            print("%s %s %s" % (url, title, email))
            scraper.savaDateMysql(url, title, email)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    main()
# Source: