#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
from bs4 import BeautifulSoup
import socket
import time
import MySQLdb
import sys
import jieba
import jieba.analyse
# Python 2 hack so mixed str/unicode handling defaults to UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')

timeout = 10
socket.setdefaulttimeout(timeout)  # give up on any single request after 10 seconds

sleep_download_time = 20
time.sleep(sleep_download_time)  # pause before crawling to throttle requests
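# Not in the original: with a global socket timeout set, urllib.urlopen can
# raise socket.timeout mid-crawl, so a guarded fetch along these lines would
# keep one dead page from killing the whole run (a sketch, not the author's code):
#
#     try:
#         page = urllib.urlopen(url).read()
#     except socket.timeout:
#         page = ''  # skip pages that do not answer within the timeout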
print 'Starting URL parsing, please wait ----------------------'
url_caijing = []
html_caijing = []
templist_caijing = []  # initialization (note: these three lists are never used below)
url_caijing0 = 'http://stock.caijing.com.cn/gpdp/index.html'  # market overview
html_caijing0 = BeautifulSoup(urllib.urlopen(url_caijing0).read(), from_encoding="gb18030")  # pages are GB18030-encoded
templist_caijing0 = html_caijing0.findAll("ul", {"class": "list"})
for i in templist_caijing0:
    # each pass overwrites the list, so only the last <ul class="list"> is kept
    newslist_caijing0 = i.findAll("a")
s = len(newslist_caijing0)
print 'Loaded', s, 'records from url_caijing0 -----------------------'
url_caijing1 = 'http://stock.caijing.com.cn/stockresearch/'  # strategy research
html_caijing1 = BeautifulSoup(urllib.urlopen(url_caijing1).read(), from_encoding="gb18030")
templist_caijing1 = html_caijing1.findAll("ul", {"class": "list"})
for i in templist_caijing1:
    newslist_caijing1 = i.findAll("a")
s = len(newslist_caijing1)
print 'Loaded', s, 'records from url_caijing1 -----------------------'

url_caijing2 = 'http://stock.caijing.com.cn/market/'  # stock market
html_caijing2 = BeautifulSoup(urllib.urlopen(url_caijing2).read(), from_encoding="gb18030")
templist_caijing2 = html_caijing2.findAll("ul", {"class": "list"})
for i in templist_caijing2:
    newslist_caijing2 = i.findAll("a")
s = len(newslist_caijing2)
print 'Loaded', s, 'records from url_caijing2 -----------------------'

url_caijing3 = 'http://industry.caijing.com.cn/industrianews/'  # industry news
html_caijing3 = BeautifulSoup(urllib.urlopen(url_caijing3).read(), from_encoding="gb18030")
templist_caijing3 = html_caijing3.findAll("ul", {"class": "list"})
for i in templist_caijing3:
    newslist_caijing3 = i.findAll("a")
s = len(newslist_caijing3)
print 'Loaded', s, 'records from url_caijing3 -----------------------'

url_caijing4 = 'http://economy.caijing.com.cn/economynews/'  # macro news
html_caijing4 = BeautifulSoup(urllib.urlopen(url_caijing4).read(), from_encoding="gb18030")
templist_caijing4 = html_caijing4.findAll("ul", {"class": "list"})
for i in templist_caijing4:
    newslist_caijing4 = i.findAll("a")
s = len(newslist_caijing4)
print 'Loaded', s, 'records from url_caijing4 -----------------------'
print 'Waiting for URL parsing ----------------------------------'
newslist_caijing = newslist_caijing0 + newslist_caijing1 + newslist_caijing2 + newslist_caijing3 + newslist_caijing4
# collect the link and headline of every article
hlink_caijing = []
title_caijing = []
for i in range(len(newslist_caijing)):
    hlink_caijing.append(newslist_caijing[i]['href'])
    title_caijing.append(newslist_caijing[i].get_text())
# fetch every article page and keep the <p> tags inside the body container
temp_caijing = []
for i in hlink_caijing:
    temp_caijing.append(BeautifulSoup(urllib.urlopen(i).read(), from_encoding="gb18030").find('div', {"id": "the_content"}).findAll("p"))
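# Note: find() returns None when a page lacks div#the_content, which would
# crash the chained findAll above. A defensive variant (an assumption, not
# part of the original) would be:
#
#     body = BeautifulSoup(urllib.urlopen(i).read(), from_encoding="gb18030").find('div', {"id": "the_content"})
#     temp_caijing.append(body.findAll("p") if body is not None else [])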
# flatten each article's <p> tags into a list of paragraph strings
tcaijing = []
for i in range(len(temp_caijing)):
    tcaijing.append([])
    t = temp_caijing[i]
    for j in t:
        tcaijing[i].append(j.get_text())
contents_caijing = []
for i in tcaijing:
    contents_caijing.append('\n'.join(i))  # join paragraphs into one text block
# keyword string per article: the ten highest-scoring TF-IDF terms
keyword = []
for i in contents_caijing:
    keyword.append(",".join(jieba.analyse.extract_tags(i, topK=10)))
# one fetch timestamp per article
gettime = []
for i in range(len(hlink_caijing)):
    gettime.append(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
# rows as (category, source, link, title, keywords, content, fetch time)
news_caijing = []
for i in range(len(hlink_caijing)):
    news_caijing.append(('财经', '财经网', hlink_caijing[i], title_caijing[i].encode('utf-8'), keyword[i].encode('utf-8'), contents_caijing[i].encode('utf-8'), gettime[i]))
conn = MySQLdb.connect(host="localhost", user="root", passwd="123456", charset="utf8")
conn.select_db('test')
cur = conn.cursor()
# REPLACE INTO overwrites existing rows (given a unique key on the table),
# so re-running the crawl does not pile up duplicates
cur.executemany("""replace into getnewslist values(%s,%s,%s,%s,%s,%s,%s)""", news_caijing)
conn.commit()
cur.close()
conn.close()
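# The snippet never shows the getnewslist schema; a layout the seven-column
# REPLACE above would fit (column names are assumptions, not the original's):
#
#     CREATE TABLE getnewslist (
#         category VARCHAR(32),
#         source   VARCHAR(64),
#         link     VARCHAR(255) PRIMARY KEY,  -- REPLACE needs a unique key to dedupe
#         title    VARCHAR(255),
#         keyword  VARCHAR(255),
#         content  MEDIUMTEXT,
#         gettime  DATETIME
#     ) DEFAULT CHARSET=utf8;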
# Source: http://www.codesnippet.cn/detail/250920136102.html