# 新闻采集 (news collection)

 
#!/usr/bin/python 
# encoding: utf-8
# Runs under Python 3.x
import urllib.request
from bs4 import BeautifulSoup
import re
 
def removeTags(content):#谁起的这个名字
    strs = re.split("<style>.*?</style>" \\
                    "|<script.*?>.*?</script>" \\
                    "|&#[0-9]+;"
                    "|<!--\\[if !IE\\]>.+?<!\\[endif\\]-->" \\
                    "|<.*?>" \\
                    "|strong#.*?no-repeat}" \\
                    , content)#各种匹配，通过“|”分隔
    ans = ''
    #将切分的结果组合起来
    for each in strs:
        ans += each
    return ans
 
#Step1
page = urllib.request.urlopen("http://news.qq.com/")
if page.getcode() != 200:
    exit()
mybytes = page.read()
page.close()
html = mybytes.decode("gbk")
soup = BeautifulSoup(html)
exp = r'http://news.qq.com/a/.*'#这个是用来匹配符合条件的链接，使用正则表达式匹配
newsListPattern = re.compile(exp)
links = soup.findAll(href = newsListPattern)
 
expRemoveTag = r'<script>.*?</script>'
 
for link in links:
    #print(link['href'])    链接
    page = urllib.request.urlopen(link['href'])
    if page.getcode() != 200:
        exit()
    mybytes = page.read()
    page.close()
    newshtml = mybytes.decode("gbk")
    soup = BeautifulSoup(newshtml)
    newsTitleTag = soup.find('h1')  #新闻标题节点
    newsTitle = newsTitleTag.contents[0];   #新闻标题内容
 
    print(newsTitle)
    #图片内容
    newContent = []
    newsContentTag = soup.find(id='Cnt-Main-Article-QQ') 
    if newsContentTag != None:
        newsImgTopTag = newsContentTag.find('p')   #图片顶层节点
        if newsImgTopTag != None:
            newImgTag = newsImgTopTag.find("img")
            if newImgTag != None:
                imgLink = newImgTag['src']                  #图片链接
                imgAlt = newImgTag['alt']                   #图片文字说明
                print(imgLink)
                print(imgAlt)
     
            #获取文本信息
            nextNewsContentTags = newsImgTopTag.find_next_siblings()
            newContent.clear();
            for newsContentTag in nextNewsContentTags:
                if len(newsContentTag.contents) > 0:
                    newContent.append(newsContentTag.contents[0])
 
    #打印单条新闻
    if newContent != None:
        strs = ''
        for content in newContent:
            if content.string != None:
                strs += content.string
        strs = removeTags(strs)
        print(strs)
#该片段来自于http://www.codesnippet.cn/detail/210520149636.html
# 来源: http://www.codesnippet.cn/detail/210520149636.html
# 与本文相关文章
#
# 暂无,快来抢沙发吧！