#!/usr/bin/python
#encoding='utf-8'
#running under python 3.x
import re
import sys
import urllib.request

from bs4 import BeautifulSoup
# Markup fragments stripped by removeTags. Alternatives are tried left to
# right at each position, so the specific block patterns are listed before
# the generic "<...>" tag pattern.
_MARKUP_PATTERN = re.compile(
    r"<style>.*?</style>"
    r"|<script.*?>.*?</script>"
    r"|&#[0-9]+;"
    r"|<!--\[if !IE\]>.+?<!\[endif\]-->"
    r"|<.*?>"
    r"|strong#.*?no-repeat}"
)


def removeTags(content):
    """Return *content* with HTML markup fragments removed.

    The following are deleted: ``<style>...</style>`` blocks,
    ``<script ...>...</script>`` blocks, numeric character references such
    as ``&#160;``, ``<!--[if !IE]>...<![endif]-->`` conditional comments,
    any remaining ``<...>`` tag, and text running from ``strong#`` up to
    ``no-repeat}``.

    No DOTALL flag is used, so ``.`` does not cross newlines: a fragment is
    only removed when it sits entirely on one line.

    Args:
        content: The text to clean.

    Returns:
        The text with every matching fragment replaced by nothing.
    """
    # Substituting the empty string is equivalent to splitting on the
    # pattern and gluing the pieces back together, in a single pass.
    return _MARKUP_PATTERN.sub('', content)
# Step 1: crawl the news.qq.com front page and print every linked article.
INDEX_URL = "http://news.qq.com/"
# Only hrefs matching this pattern are followed.
ARTICLE_LINK_PATTERN = re.compile(r'http://news.qq.com/a/.*')
# id of the element searched for the lead image and text
# (presumably the article body container -- confirm against the live markup).
ARTICLE_BODY_ID = 'Cnt-Main-Article-QQ'
# Every downloaded page is decoded with this codec.
PAGE_ENCODING = "gbk"


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if the status is not 200.

    A non-200 status is reported on stderr. Exceptions raised by
    ``urlopen`` itself are not caught and propagate to the caller.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as page:
        status = page.getcode()
        if status != 200:
            print("unexpected HTTP status %s for %s" % (status, url),
                  file=sys.stderr)
            return None
        raw = page.read()
    # errors="replace" keeps one malformed byte from aborting the crawl.
    html = raw.decode(PAGE_ENCODING, errors="replace")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(html, "html.parser")


def _print_article(soup):
    """Print the title, lead image details and text of one parsed article."""
    # News title: first child of the first <h1>, when there is one.
    title_tag = soup.find('h1')
    if title_tag is not None and title_tag.contents:
        print(title_tag.contents[0])

    fragments = []
    body = soup.find(id=ARTICLE_BODY_ID)
    first_para = body.find('p') if body is not None else None
    if first_para is not None:
        # Image: the first <p> is searched for an <img>; print its link and
        # caption, skipping whichever attribute is absent.
        img = first_para.find("img")
        if img is not None:
            for attr in ('src', 'alt'):
                value = img.get(attr)
                if value is not None:
                    print(value)
        # Text: take the first child of every sibling that follows the
        # first <p>.
        for sibling in first_para.find_next_siblings():
            if sibling.contents:
                fragments.append(sibling.contents[0])

    # Print the article text. Children whose .string is None contribute
    # nothing; an article with no text still prints an empty line.
    text = ''.join(
        node.string for node in fragments if node.string is not None
    )
    print(removeTags(text))


def main():
    """Fetch the index page, then fetch and print each linked article.

    The run stops at the first page that does not answer with status 200.
    """
    index = _fetch_soup(INDEX_URL)
    if index is None:
        return
    for link in index.find_all(href=ARTICLE_LINK_PATTERN):
        article = _fetch_soup(link['href'])
        if article is None:
            # A failed article download ends the whole run.
            return
        _print_article(article)


if __name__ == "__main__":
    main()
# This snippet comes from http://www.codesnippet.cn/detail/210520149636.html
# Source: http://www.codesnippet.cn/detail/210520149636.html