用 Python 爬取 B 站视频弹幕并生成词云图进行分析

爬虫: requests,beautifulsoup

词云: wordcloud,jieba

代码加注释:

# -*- coding: utf-8 -*-
 import xlrd# 读取 Excel
 import xlwt# 写入 Excel
 import requests
 import linecache
 import wordcloud
 import jieba
 import matplotlib.pyplot as plt
 from bs4 import BeautifulSoup
 if __name__ == "__main__":
     # Script flow: download the danmaku XML for one bilibili oid, collect the
     # text of its <d> elements, segment the text with jieba, and display the
     # result as a word cloud.
     target = 'https://api.bilibili.com/x/v1/dm/list.so?oid=132084205'  # bilibili danmaku list for one oid
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
     headers = {'User-Agent': user_agent}  # present the request as coming from a browser

     # Actually send the headers (they used to be built but never passed), and
     # bound the wait so a stalled connection cannot hang the script forever.
     req = requests.get(url=target, headers=headers, timeout=10)
     req.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page

     # Give BeautifulSoup the raw bytes and let it detect the encoding itself.
     # Re-encoding req.text as ISO 8859-1 only works when requests happened to
     # decode with that fallback; with any other detected charset it raises
     # UnicodeEncodeError on non-Latin-1 text.
     bf = BeautifulSoup(req.content, "html.parser")

     texts = bf.find('i')
     if texts is None:
         # Without this guard the next line would fail with AttributeError.
         raise SystemExit("No <i> element found in the response; nothing to plot.")
     texts_div = texts.find_all('d')  # presumably one <d> per danmaku comment

     # Concatenate the text of every <d> element in one pass.
     yun = "".join(item.text for item in texts_div)

     # Strip filler characters that carry no meaning in the cloud.
     yun = yun.translate(str.maketrans("", "", "哈啊一"))

     # wordcloud splits on whitespace and cannot segment Chinese by itself, so
     # cut the text with jieba and join the tokens with spaces first.
     cut_text = " ".join(jieba.cut(yun))
     if not cut_text.strip():
         # WordCloud.generate needs at least one word to draw.
         raise SystemExit("No danmaku text left after filtering; nothing to plot.")

     wc = wordcloud.WordCloud(
         # A font with CJK glyphs is required, otherwise characters render as
         # empty boxes. This is a Windows font path; change it on other systems.
         font_path="C:/Windows/Fonts/simfang.ttf",
         # Background colour and canvas size.
         background_color="white",
         width=1000,
         height=880,
     ).generate(cut_text)

     plt.imshow(wc, interpolation="bilinear")
     plt.axis("off")
     plt.show()
     print("Done!")

运行结果图:

来源: http://www.bubuko.com/infodetail-3320151.html

与本文相关文章

暂无,快来抢沙发吧！