爬虫: requests,beautifulsoup
词云: wordcloud,jieba
代码加注释:
# -*- coding: utf-8 -*-
"""Fetch the danmaku (bullet comments) of one Bilibili video and show a word cloud.

Pipeline: download the danmaku XML -> collect the text of every <d> element
-> drop a few filler characters -> segment with jieba -> render with
wordcloud -> display with matplotlib.
"""
import linecache

import jieba
import matplotlib.pyplot as plt
import requests
import wordcloud
import xlrd  # read Excel (not used by this script; kept from the original listing)
import xlwt  # write Excel (not used by this script; kept from the original listing)
from bs4 import BeautifulSoup

# Bilibili danmaku endpoint; `oid` selects the video whose comments are fetched.
TARGET_URL = 'https://api.bilibili.com/x/v1/dm/list.so?oid=132084205'
# Browser-like User-Agent sent with the request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
# A font containing CJK glyphs is required, otherwise the cloud shows empty
# boxes. This is the usual Windows location; change it on other systems.
FONT_PATH = "C:/Windows/Fonts/simfang.ttf"
# Single characters removed from the text because they carry no meaning.
FILLER_CHARS = "哈啊一"


def fetch_danmaku(url=TARGET_URL, timeout=10):
    """Return the text of every <d> element found at *url*.

    Args:
        url: Danmaku XML endpoint to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of comment strings; empty if the document has no <i> root.

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
    """
    headers = {'User-Agent': USER_AGENT}
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes instead of round-tripping `response.text` through
    # ISO-8859-1: that trick raises UnicodeEncodeError whenever requests picks
    # a charset other than Latin-1 for `.text`.
    soup = BeautifulSoup(response.content, "html.parser", from_encoding="utf-8")
    root = soup.find('i')
    if root is None:
        return []
    return [item.text for item in root.find_all('d')]


def strip_filler(text, chars=FILLER_CHARS):
    """Return *text* with every character listed in *chars* removed."""
    return text.translate(str.maketrans("", "", chars))


def build_wordcloud(text, font_path=FONT_PATH):
    """Segment *text* with jieba and return the generated WordCloud.

    wordcloud splits on whitespace, so Chinese text has to be tokenised
    first and re-joined with spaces to get meaningful words.
    """
    cut_text = " ".join(jieba.cut(text))
    cloud = wordcloud.WordCloud(
        font_path=font_path,
        background_color="white",
        width=1000,
        height=880,
    )
    return cloud.generate(cut_text)


def main():
    """Download the comments, build the word cloud and display it."""
    text = strip_filler("".join(fetch_danmaku()))
    if not text.strip():
        # WordCloud.generate() raises an opaque ValueError on empty input.
        raise SystemExit("No danmaku text found; nothing to plot.")
    wc = build_wordcloud(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    print("Done!")


if __name__ == "__main__":
    main()
运行结果图:
来源: http://www.bubuko.com/infodetail-3320151.html