网易云音乐评论 可视化分析
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2017/3/29 9:07
4 # @Author : Lyrichu
5 # @Email : 919987476@qq.com
6 # @File : NetCloud_comments_plot.py
7 '''
8 @Description:
9 对抓取来的网易云评论数据进行简单的可视化分析
10 '''
11 from NetCloud_spider3
import NetCloudCrawl
12 import requests
13 import matplotlib.dates as mdates
14 from pylab
import *
15 mpl.rcParams[
'font.sans-serif'] = [
'SimHei']
# 防止无法显示中文
16 import matplotlib.pyplot as plt
17 from datetime
import datetime
18 import re
19 import time
20 import pandas as pd
21 import codecs
22 import jieba
23 from wordcloud
import WordCloud
24 from scipy.misc
import imread
25 from os
import path
26 import os
27
28
29 class NetCloudProcessor(NetCloudCrawl):
30 # 读取评论文本数据,返回一个列表,列表的每个元素为一个字典,字典中包含用户id,评论内容等
def read_comments_file(self, filename):
    """Parse a crawled comment file into de-duplicated comment records.

    The first line of the file is treated as a header and skipped. Each
    remaining line either starts a new record made of six space-separated
    fields (user id, nickname, avatar URL, comment time, liked count,
    comment text) or is a continuation of the previous record's text,
    because a comment may span several lines.

    Lines are processed in file order so that continuation lines are
    attached to the record they follow; duplicates are removed afterwards
    on whole records.

    Args:
        filename: Path of the comment file to read.

    Returns:
        A tuple ``(count, comments)``. ``comments`` is a list of dicts with
        the keys ``userID``, ``nickname``, ``avatarUrl``, ``comment_time``
        (int), ``likedCount`` (int) and ``comment_content``, sorted by
        ``comment_time``; ``count`` equals ``len(comments)``.
    """
    with open(filename, 'r') as f:
        # Slicing drops the header row and also tolerates an empty file.
        raw_lines = f.readlines()[1:]

    parsed = []
    for raw_line in raw_lines:
        line = raw_line.replace("\n", "")
        record = None
        if re.match(r'\d', line):
            fields = line.split(' ', 5)
            try:
                record = {
                    'userID': fields[0],
                    'nickname': fields[1],
                    'avatarUrl': fields[2],
                    'comment_time': int(fields[3]),
                    'likedCount': int(fields[4]),
                    'comment_content': fields[5],
                }
            except (IndexError, ValueError):
                # Starts with a digit but is not a well-formed record, so
                # it is comment text that merely begins with a number.
                record = None
        if record is not None:
            parsed.append(record)
        elif parsed:
            parsed[-1]['comment_content'] += line
        else:
            # Continuation text with no record before it cannot be placed.
            print(u"跳过无法归属的评论行!")

    # De-duplicate complete records while keeping first-seen order.
    seen = set()
    list_comments = []
    for record in parsed:
        identity = (
            record['userID'],
            record['nickname'],
            record['avatarUrl'],
            record['comment_time'],
            record['likedCount'],
            record['comment_content'],
        )
        if identity in seen:
            continue
        seen.add(identity)
        list_comments.append(record)

    list_comments.sort(key=lambda record: record['comment_time'])
    total = len(list_comments)
    print(u"去除重复之后有%d条评论!" % total)
    return (total, list_comments)
59
60 # 将网易云的时间戳转换为年-月-日的日期函数
61 # 时间戳需要先除以1000才能得到真实的时间戳
62 # format 为要转换的日期格式
def from_timestamp_to_date(self, time_stamp, format):
    """Format a millisecond timestamp as a local-time date string.

    Args:
        time_stamp: Timestamp in milliseconds; it is scaled by 0.001 to
            seconds before conversion.
        format: ``time.strftime`` format string for the result.

    Returns:
        The formatted local date/time string.
    """
    seconds = time_stamp * 0.001
    local_struct = time.localtime(seconds)
    return time.strftime(format, local_struct)
67
68
69 # 统计相关数据写入文本文件
def count_comments_info(self, comments_list, count_, song_name):
    """Write per-month, per-day and per-liked-count tallies to text files.

    Three files are written inside the ``song_name`` directory:
    ``comments_num_by_Ym.txt``, ``comments_num_by_Ymd.txt`` and
    ``likedCount.txt``. Each holds a header line followed by
    ``<value> <occurrences>`` rows in first-seen order.

    Args:
        comments_list: Comment dicts; each must provide ``comment_time``
            and ``likedCount``.
        count_: Number of leading entries of ``comments_list`` to use.
        song_name: Directory the statistics files are written to.
    """
    def tally(values):
        """Count occurrences in one pass, keeping first-seen order."""
        order = []
        counts = {}
        for value in values:
            if value in counts:
                counts[value] += 1
            else:
                counts[value] = 1
                order.append(value)
        return [(value, counts[value]) for value in order]

    def write_table(basename, header, rows):
        """Write one header line plus one ``value count`` line per row."""
        with open(u"%s/%s" % (song_name, basename), "w") as f:
            f.write(header)
            for value, occurrences in rows:
                f.write(str(value) + " " + str(occurrences) + "\n")
        print(u"成功写入%s!" % basename)

    months = []
    days = []
    liked_counts = []
    for comment in comments_list[:count_]:
        time_stamp = comment['comment_time']
        months.append(self.from_timestamp_to_date(time_stamp, '%Y-%m'))
        days.append(self.from_timestamp_to_date(time_stamp, '%Y-%m-%d'))
        liked_counts.append(comment['likedCount'])

    write_table("comments_num_by_Ym.txt", "date_Ym comments_num\n", tally(months))
    write_table("comments_num_by_Ymd.txt", "date_Ymd comments_num\n", tally(days))
    write_table("likedCount.txt", "likedCount count_num\n", tally(liked_counts))
119 # 得到处理过的x_date 和 count 统计信息
def get_xdate_ycount(self, count_file_name, date_type, min_date_Ym, max_date_Ym, min_date_Ymd, max_date_Ymd):
    """Read a date/count statistics file and keep the rows inside a range.

    The file has one header line followed by ``<date> <count>`` rows.
    Dates are compared numerically after removing the ``-`` separators.

    Args:
        count_file_name: Path of the statistics file.
        date_type: ``'%Y-%m-%d'`` selects the ``*_Ymd`` bounds; any other
            value selects the ``*_Ym`` bounds.
        min_date_Ym: Inclusive lower bound, ``YYYY-MM``.
        max_date_Ym: Inclusive upper bound, ``YYYY-MM``.
        min_date_Ymd: Inclusive lower bound, ``YYYY-MM-DD``.
        max_date_Ymd: Inclusive upper bound, ``YYYY-MM-DD``.

    Returns:
        A tuple ``(x_date, y_count)``: the date strings within the range
        and their counts as ints, in file order.
    """
    def as_number(date_text):
        """Turn ``2017-03-29`` into the comparable integer 20170329."""
        return int("".join(date_text.split("-")))

    # Only the pair of bounds matching date_type is parsed, and only once.
    if date_type == '%Y-%m-%d':
        lower, upper = as_number(min_date_Ymd), as_number(max_date_Ymd)
    else:
        lower, upper = as_number(min_date_Ym), as_number(max_date_Ym)

    with open(count_file_name, 'r') as f:
        # Slicing skips the header and tolerates an empty file.
        data_lines = f.readlines()[1:]

    x_date = []
    y_count = []
    for line in data_lines:
        fields = line.split()
        if len(fields) < 2:
            continue  # blank or truncated row
        if lower <= as_number(fields[0]) <= upper:
            x_date.append(fields[0])
            y_count.append(int(fields[1]))
    return (x_date, y_count)
140
141
142 # 绘制图形展示歌曲评论以及点赞分布
143 # plot_type:为 'plot' 绘制散点图 为 'bar' 绘制条形图
144 # date_type 为日期类型
145 # time_distance 为时间间隔(必填)例如:5D 表示5天,1M 表示一个月
146 # min_liked_num 为绘图时的最小点赞数
147 # max_liked_num 为绘图时的最大点赞数
148 # min_date_Ym 为最小日期(年-月形式)
149 # max_date_Ym 为最大日期(年-月形式)
150 # min_date_Ymd 为最小日期(年-月-日形式)
151 # max_date_Ymd 为最大日期(年-月-日形式)
def plot_comments(self, song_name, settings):
    """Plot a song's comment counts over time, or a pie of liked counts.

    Args:
        song_name: Song name; also the directory containing
            ``comments_num_by_Ymd.txt``, ``comments_num_by_Ym.txt`` and
            ``likedCount.txt``.
        settings: Dict of plot options. ``comment_type`` picks the chart:
            truthy draws comment counts by date, falsy draws the liked
            count pie chart.

            Keys read for the date chart: ``date_type`` (``'%Y-%m-%d'``
            uses the daily file, anything else the monthly file),
            ``min_date_Ym``/``max_date_Ym``/``min_date_Ymd``/
            ``max_date_Ymd`` (inclusive range), ``plot_type`` (``'plot'``
            for a line, ``'bar'`` for bars, anything else for a scatter),
            ``bar_width``, ``color``, ``rotation`` and ``time_distance``
            (frequency string passed to ``pandas.date_range``).

            Keys read for the pie chart: ``pie_colors``, ``auto_pct`` and
            ``expl``.
    """
    title_prefix = u"网易云音乐歌曲《" + song_name + u"》"

    if settings['comment_type']:
        date_type = settings['date_type']
        if date_type == '%Y-%m-%d':
            count_file_name = u"%s/comments_num_by_Ymd.txt" % song_name
        else:
            count_file_name = u"%s/comments_num_by_Ym.txt" % song_name
        # Reuse the shared reader instead of duplicating its parsing here.
        x_date, y_count = self.get_xdate_ycount(
            count_file_name,
            date_type=date_type,
            min_date_Ym=settings['min_date_Ym'],
            max_date_Ym=settings['max_date_Ym'],
            min_date_Ymd=settings['min_date_Ymd'],
            max_date_Ymd=settings['max_date_Ymd'],
        )
        if not x_date:
            # Nothing to draw; x[0] below would otherwise raise IndexError.
            print(u"《%s》在所选日期范围内没有评论数据!" % song_name)
            return

        x = [datetime.strptime(d, date_type).date() for d in x_date]
        axis = plt.gca().xaxis
        axis.set_major_formatter(mdates.DateFormatter('%s' % date_type))
        if date_type == '%Y-%m-%d':
            axis.set_major_locator(mdates.DayLocator())
        else:
            axis.set_major_locator(mdates.MonthLocator())

        plot_type = settings['plot_type']
        if plot_type == 'plot':
            plt.plot(x, y_count, color=settings['color'])
        elif plot_type == 'bar':
            plt.bar(x, y_count, width=settings['bar_width'], color=settings['color'])
        else:
            plt.scatter(x, y_count, color=settings['color'])

        plt.gcf().autofmt_xdate(rotation=settings['rotation'])
        plt.title(title_prefix + u"评论" + u"数目分布")
        plt.xlabel(u"日期")
        plt.ylabel(u"数目")
        plt.xticks(pd.date_range(x[0], x[-1], freq="%s" % settings['time_distance']))
        plt.show()
    else:
        with open(u"%s/likedCount.txt" % song_name, 'r') as f:
            data_lines = f.readlines()[1:]

        x_labels = [u'10-100', u'100-1000', u'1000-10000', u'10000以上']
        y_count = [0, 0, 0, 0]
        for line in data_lines:
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or truncated row
            liked = int(fields[0])
            occurrences = int(fields[1])
            if liked < 10:
                # Below the first bucket: ignored on purpose. Previously
                # these rows fell through into the "100-1000" bucket.
                continue
            if liked <= 100:
                y_count[0] += occurrences
            elif liked <= 1000:
                y_count[1] += occurrences
            elif liked <= 10000:
                y_count[2] += occurrences
            else:
                y_count[3] += occurrences

        if sum(y_count) == 0:
            # A pie chart cannot be drawn from all-zero wedges.
            print(u"《%s》没有点赞数不少于10的评论!" % song_name)
            return

        plt.pie(
            y_count,
            labels=x_labels,
            explode=settings['expl'],
            colors=settings['pie_colors'],
            autopct=settings['auto_pct'],
        )
        plt.title(title_prefix + u"点赞" + u"数目分布")
        plt.legend(x_labels)
        plt.show()
    plt.close()
236
237
238 # 生成某个歌曲的统计信息文件
def generate_count_info_files(self, song_name):
    """Build the statistics files for one song from its comment file.

    Reads ``<song_name>/<song_name>.txt`` through ``read_comments_file``,
    reports the number of comments and hands the records to
    ``count_comments_info``.

    Args:
        song_name: Song name, used as both directory and file stem.
    """
    comments_path = "%s/%s.txt" % (song_name, song_name)
    total, comments = self.read_comments_file(comments_path)
    print(u"%s有%d条评论!" % (song_name, total))
    self.count_comments_info(comments, total, song_name)
244
245 # 一步完成数据抓取,生成统计信息文件的工作
def create_all_necessary_files(self, song_id, song_name):
    """Run the whole pipeline for one song and report the elapsed time.

    Steps, in order: save all comments, save the hot comments, generate
    the statistics files, save the commenters' profile info and draw the
    word cloud from the full comment file.

    Args:
        song_id: Numeric song id (formatted with ``%d`` into the URL).
        song_name: Song name; output files go into a directory of this
            name.
    """
    started_at = time.time()

    # Crawl every comment and persist it.
    self.save_all_comments_to_file(song_id, song_name)

    # Hot comments go to their own file.
    hot_url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%d/?csrf_token=" % song_id
    hot_comments = self.get_hot_comments(hot_url)
    self.save_to_file(hot_comments, u"%s/hotcomments.txt" % song_name)

    # Statistics, commenter profiles, then the word cloud.
    self.generate_count_info_files(song_name)
    self.save_commenters_info_to_file(song_name)
    self.draw_wordcloud(song_name, singer_name=False)

    finished_at = time.time()
    print(u"任务完成!程序耗时%f秒!" % (finished_at - started_at))
262 # 得到某首歌曲下所有评论者(需要去除重复)的主页信息
def get_commenters_info(self, filename):
    """Fetch profile statistics for each distinct commenter in a file.

    The first line of ``filename`` is skipped as a header. Every line that
    starts with a digit contributes its first space-separated token as a
    user id; each distinct user's home page is then downloaded with
    ``requests`` (using ``self.headers``) and scraped with regular
    expressions.

    A user whose page cannot be fetched, or lacks the event/follow/fan
    counters, is reported and skipped so one bad page does not abort the
    crawl.

    Args:
        filename: UTF-8 comment file listing the commenters.

    Returns:
        A list of dicts with the keys ``user_id``, ``event_count``,
        ``follow_count``, ``fan_count``, ``location``,
        ``self_description``, ``age`` and ``listening_songs_num``. Missing
        optional values are replaced by placeholder strings.
    """
    def first_group(pattern, text, default=None):
        """Return group 1 of the first match, or ``default`` if none."""
        found = pattern.search(text)
        return found.group(1) if found else default

    # Compiled once here rather than on every loop iteration.
    user_id_re = re.compile(r'.*id=(\d+)')
    event_count_re = re.compile(r'<strong id="event_count">(\d+?)</strong>')
    follow_count_re = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
    fan_count_re = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
    # Unicode patterns are required for the Chinese text to match.
    location_re = re.compile(u'<span>所在地区:(.+?)</span>')
    self_description_re = re.compile(u'<div class="inf s-fc3 f-brk">个人介绍:(.*?)</div>')
    age_re = re.compile(r'<span.*?data-age="(\d+)">')
    listening_songs_num_re = re.compile(u'<h4>累积听歌(\\d+?)首</h4>')
    ms_per_year = 1000 * 365 * 24 * 3600

    with codecs.open(filename, "r", encoding='utf-8') as f:
        # Slicing skips the header and tolerates an empty file.
        lines = f.readlines()[1:]

    # Distinct home-page URLs in first-seen order, so runs are repeatable.
    commenters_urls_list = []
    seen_urls = set()
    for info in lines:
        if re.match(r'\d', info):
            url = u"http://music.163.com/user/home?id=" + info.split(" ")[0]
            if url not in seen_urls:
                seen_urls.add(url)
                commenters_urls_list.append(url)
    print("共有%d个不同评论者!" % len(commenters_urls_list))

    commenters_info_list = []
    for index, url in enumerate(commenters_urls_list):
        try:
            user_id = user_id_re.search(url).group(1)
            html = requests.get(url, headers=self.headers).text

            event_count = first_group(event_count_re, html)
            follow_count = first_group(follow_count_re, html)
            fan_count = first_group(fan_count_re, html)
            if None in (event_count, follow_count, fan_count):
                print(u"用户%s的主页缺少统计数据,已跳过!" % user_id)
                continue

            age_time = first_group(age_re, html)
            if age_time is None:
                age = u"未知年龄"
            else:
                # data-age is treated as the birth date in milliseconds
                # since the Unix epoch. Floor division keeps the age an
                # int on Python 3 as well.
                # NOTE(review): the reference year 2017 is hard-coded; the
                # age buckets in sub_plot_commenters_info look calibrated
                # to the same year, so change both together.
                age = (2017 - 1970) - int(age_time) // ms_per_year

            info_dict = {
                'user_id': user_id,
                'event_count': event_count,
                'follow_count': follow_count,
                'fan_count': fan_count,
                'location': first_group(location_re, html, u"未知地区"),
                'self_description': first_group(self_description_re, html, u"未知个人介绍"),
                'age': age,
                'listening_songs_num': first_group(listening_songs_num_re, html, u'未知听歌总数'),
            }
            commenters_info_list.append(info_dict)
            print("成功添加%d个用户信息!" % (index + 1))
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(e)
    return commenters_info_list
322
323 # 保存评论者的信息
def save_commenters_info_to_file(self, song_or_singer_name):
    """Collect commenter profiles and write them to ``commenters_info.txt``.

    The commenters are taken from ``<name>/<name>.txt`` when that file
    exists, otherwise from ``<name>/hotcomments.txt``. The output file is
    UTF-8 with one header line followed by one space-separated line per
    commenter.

    Args:
        song_or_singer_name: Song or singer name; used as the directory
            for both the input and the output file.
    """
    full_comments_path = u"%s/%s.txt" % (song_or_singer_name, song_or_singer_name)
    if os.path.exists(full_comments_path):
        filename = full_comments_path
    else:
        filename = u"%s/hotcomments.txt" % song_or_singer_name

    commenters_info_lists = self.get_commenters_info(filename)

    # Column order must match the header line written below.
    field_order = (
        'user_id',
        'event_count',
        'follow_count',
        'fan_count',
        'location',
        'self_description',
        'age',
        'listening_songs_num',
    )
    with codecs.open(u"%s/commenters_info.txt" % song_or_singer_name, "w", encoding='utf-8') as f:
        f.write(u"用户ID 动态总数 关注总数 粉丝总数 所在地区 个人介绍 年龄 累计听歌总数\n")
        for info in commenters_info_lists:
            # u"%s" converts ints and text alike without the Python 2 only
            # ``unicode`` builtin.
            full_info = u" ".join(u"%s" % info[key] for key in field_order) + u"\n"
            f.write(full_info)
    print(u"成功写入文件%s/commenters_info.txt" % song_or_singer_name)
344
345 # 得到某个歌手全部热门歌曲id列表
def get_songs_ids(self, singer_url):
    """Return the song ids linked from a singer's page.

    The page is downloaded with ``requests`` using ``self.headers`` and
    ``self.proxies``; ids are taken from ``/song?id=<digits>`` anchors.

    Args:
        singer_url: URL of the singer page.

    Returns:
        A list of song id strings in page order.
    """
    response = requests.get(singer_url, headers=self.headers, proxies=self.proxies)
    page = response.text
    song_link = re.compile(r'<a href="/song\?id=(\d+?)">.*?</a>')
    return list(re.findall(song_link, page))
354 # 得到某个歌手所有歌曲的热门评论
def get_singer_all_hot_comments(self, singer_name, singer_id):
    """Save the hot comments of every song listed on a singer's page.

    For each song id found on the singer page the hot comments are fetched
    and passed to ``save_to_file`` with the path
    ``<singer_name>/hotcomments.txt``; the directory is created on first
    use.

    Args:
        singer_name: Singer name; also the output directory.
        singer_id: Numeric singer id (formatted with ``%d`` into the URL).
    """
    song_ids = self.get_songs_ids('http://music.163.com/artist?id=%d' % singer_id)
    for raw_id in song_ids:
        comments_url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%d/?csrf_token=" % int(raw_id)
        hot_comments = self.get_hot_comments(comments_url)
        # Make sure the target directory exists before saving.
        if not os.path.exists(singer_name):
            os.mkdir(singer_name)
        self.save_to_file(hot_comments, u"%s/hotcomments.txt" % singer_name)
    print(u"成功写入%s的%d首歌曲!" % (singer_name, len(song_ids)))
367
368 # 在一张图中绘制多个歌曲的评论分布
369 # song_names_list 为多个歌曲名字的列表
370 # settings 为含有字典元素的列表,每个字典含有每个子图的配置项
def sub_plot_comments(self, song_names_list, settings, row, col):
    """Draw the comment-count timelines of several songs in one figure.

    Song ``i`` is drawn in subplot ``i + 1`` of a ``row`` x ``col`` grid
    using ``settings[i]``. A song with no data inside its date range is
    reported and its cell is left empty instead of aborting the figure.

    Args:
        song_names_list: Song names; each is also the directory holding
            that song's statistics files.
        settings: List of dicts, one per song. Keys read: ``date_type``,
            ``min_date_Ym``, ``max_date_Ym``, ``min_date_Ymd``,
            ``max_date_Ymd``, ``plot_type`` (``'plot'`` line, ``'bar'``
            bars, otherwise scatter), ``bar_width``, ``color``,
            ``rotation``, ``fontsize`` and ``time_distance`` (frequency
            string passed to ``pandas.date_range``).
        row: Number of subplot rows.
        col: Number of subplot columns.
    """
    for i, song_name in enumerate(song_names_list):
        options = settings[i]
        date_type = options['date_type']
        if date_type == '%Y-%m-%d':
            count_file_name = u"%s/comments_num_by_Ymd.txt" % song_name
        else:
            count_file_name = u"%s/comments_num_by_Ym.txt" % song_name

        x_date, y_count = self.get_xdate_ycount(
            count_file_name,
            date_type=date_type,
            min_date_Ym=options['min_date_Ym'],
            max_date_Ym=options['max_date_Ym'],
            min_date_Ymd=options['min_date_Ymd'],
            max_date_Ymd=options['max_date_Ymd'],
        )
        if not x_date:
            # x[0] / x[-1] below need at least one point.
            print(u"《%s》在所选日期范围内没有评论数据,跳过该子图!" % song_name)
            continue

        plt.subplot(row, col, i + 1)
        x = [datetime.strptime(d, date_type).date() for d in x_date]

        # Show the x axis as dates.
        axis = plt.gca().xaxis
        axis.set_major_formatter(mdates.DateFormatter('%s' % date_type))
        if date_type == '%Y-%m-%d':
            axis.set_major_locator(mdates.DayLocator())
        else:
            axis.set_major_locator(mdates.MonthLocator())

        plot_type = options['plot_type']
        if plot_type == 'plot':
            plt.plot(x, y_count, color=options['color'])
        elif plot_type == 'bar':
            plt.bar(x, y_count, width=options['bar_width'], color=options['color'])
        else:
            plt.scatter(x, y_count, color=options['color'])

        plt.gcf().autofmt_xdate(rotation=options['rotation'])
        plt.title(
            u"网易云音乐歌曲《" + song_name + u"》" + (u"评论数目分布(%s到%s)" % (x[0], x[-1])),
            fontsize=options['fontsize'],
        )
        plt.xlabel(u"日期")
        plt.ylabel(u"数目")
        plt.xticks(pd.date_range(x[0], x[-1], freq="%s" % options['time_distance']))
    plt.subplots_adjust(left=0.2, bottom=0.2, right=0.8, top=0.8, hspace=1.2, wspace=0.3)
    plt.show()
410 # 得到评论列表
def get_comments_list(self, filename):
    """Return the comment texts stored in a UTF-8 comment file.

    A line that starts with a digit and has six space-separated fields is
    a record; its sixth field (the comment text) is returned without the
    trailing newline. Any other line is treated as continuation text and
    returned unchanged, newline included.

    A first line that does not start with a digit is a header row and is
    dropped, so column titles do not end up among the comments.

    Args:
        filename: Path of the comment file.

    Returns:
        A list of comment text fragments in file order.
    """
    with codecs.open(filename, "r", encoding='utf-8') as f:
        lines = f.readlines()

    # A non-digit first line cannot continue any record: it is the header.
    if lines and not re.match(r"\d", lines[0]):
        lines = lines[1:]

    comments_list = []
    for line in lines:
        fields = line.split(" ", 5) if re.match(r"\d", line) else []
        if len(fields) == 6:
            comments_list.append(fields[5].replace("\n", ""))
        else:
            # Plain continuation text, or text that merely begins with a
            # digit; keep it rather than dropping it.
            comments_list.append(line)
    return comments_list
424
425 # 绘制词云
426 # pic_path 为词云背景图片地址
427 # singer_name 为 False 时,则读取歌曲评论文件,否则读取歌手热评文件
428 # isFullComments = True 时,读取全部评论,否则只读取热评
def draw_wordcloud(self, song_name, singer_name, pic_path="JayChou.jpg", isFullComments=True):
    """Render a word cloud image from a song's or a singer's comments.

    Args:
        song_name: Song name; used when ``singer_name`` is falsy.
        singer_name: Singer name, or a falsy value (``False``, ``None``,
            ``''``) to work on the song instead. With a singer, the
            singer's ``hotcomments.txt`` is read.
        pic_path: Path of the image used as the word cloud mask.
        isFullComments: For a song, truthy reads ``<song>/<song>.txt``
            (all comments); falsy reads ``<song>/hotcomments.txt``.

    The image is written to ``<name>/<name>.jpg`` where ``name`` is the
    singer name if given, otherwise the song name.
    """
    if singer_name:
        name = singer_name
        filename = u"%s/hotcomments.txt" % singer_name
    else:
        name = song_name
        if isFullComments:
            filename = u"%s/%s.txt" % (song_name, song_name)
        else:
            filename = u"%s/hotcomments.txt" % song_name

    comments_list = self.get_comments_list(filename)
    # Join with a newline so the last word of one comment cannot fuse with
    # the first word of the next during segmentation.
    comments_text = "\n".join(comments_list)
    # Space-joined tokens are the input format WordCloud.generate expects.
    cut_text = " ".join(jieba.cut(comments_text))

    d = path.dirname(__file__)  # directory of this source file
    # NOTE(review): scipy.misc.imread was removed from recent SciPy
    # releases; confirm the installed version still provides it.
    color_mask = imread(pic_path)
    cloud = WordCloud(
        font_path=path.join(d, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    word_cloud = cloud.generate(cut_text)
    word_cloud.to_file(u"%s/%s.jpg" % (name, name))
    print(u"成功生成%s.jpg" % name)
450
451 # 对一首歌曲绘制其某一年某几个月的评论分布
452 # date_lists 为要绘制的月份
def sub_plot_months(self, song_name, DateLists, settings, row, col):
    """Plot one song's daily comment counts for several date prefixes.

    Reads ``<song_name>/comments_num_by_Ymd.txt`` and, for each entry of
    ``DateLists``, draws the days whose date matches that entry (applied
    with ``re.match``, i.e. as a pattern anchored at the start) in its own
    subplot. An entry that matches no day is reported and its cell is left
    empty instead of aborting the figure.

    Args:
        song_name: Song name; also the directory of the statistics file.
        DateLists: Date prefixes/patterns such as ``"2017-03"``, one per
            subplot.
        settings: List of dicts, one per subplot. Keys read:
            ``plot_type`` (``'plot'`` line, ``'bar'`` bars, otherwise
            scatter), ``bar_width``, ``color``, ``rotation``,
            ``fontsize`` and ``time_distance`` (frequency string passed to
            ``pandas.date_range``).
        row: Number of subplot rows.
        col: Number of subplot columns.
    """
    filename = u"%s/comments_num_by_Ymd.txt" % song_name
    with codecs.open(filename, "r", encoding='utf-8') as f:
        # Slicing skips the header and tolerates an empty file.
        rows = [line.split(" ") for line in f.readlines()[1:]]
    # (date, count) pairs; blank or truncated rows are ignored.
    daily_counts = [(fields[0], int(fields[1])) for fields in rows if len(fields) >= 2]

    for i, date_pattern in enumerate(DateLists):
        selected = [
            (day, count)
            for day, count in daily_counts
            if re.match(r"%s" % date_pattern, day)
        ]
        if not selected:
            # x[0] / x[-1] below need at least one point.
            print(u"《%s》在%s没有评论数据,跳过该子图!" % (song_name, date_pattern))
            continue

        options = settings[i]
        x = [datetime.strptime(day, "%Y-%m-%d").date() for day, _ in selected]
        y = [count for _, count in selected]

        plt.subplot(row, col, i + 1)
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))

        plot_type = options['plot_type']
        if plot_type == 'plot':
            plt.plot(x, y, color=options['color'])
        elif plot_type == 'bar':
            plt.bar(x, y, width=options['bar_width'], color=options['color'])
        else:
            plt.scatter(x, y, color=options['color'])

        plt.gcf().autofmt_xdate(rotation=options['rotation'])
        plt.title(u"《%s》%s到%s" % (song_name, x[0], x[-1]), fontsize=options['fontsize'])
        plt.xlabel(u"日期")
        plt.ylabel(u"评论数目")
        plt.xticks(pd.date_range(x[0], x[-1], freq="%s" % options['time_distance']))
    plt.subplots_adjust(left=0.09, bottom=0.27, right=0.89, top=0.83, hspace=0.35, wspace=0.35)
    plt.show()
486
487 # 绘制一首歌曲评论者相关信息的分布
488 def sub_plot_commenters_info(self,song_or_singer_name):
489 file_name = u
"%s/commenters_info.txt" %
song_or_singer_name
490 with codecs.open(file_name,
'r',encoding=
'utf-8') as f:
491 info_lists =
f.readlines()
492 del info_lists[0]
# 删除头部信息
493 event_count_list = []
# 动态总数
494 follow_count_list = []
# 关注总数
495 fan_count_list = []
# 粉丝总数
496 area_list = []
# 所在地区
497 age_list = []
# 年龄
498 listen_songs_num_list = []
# 累计听歌数目
499 for info
in info_lists:
500 info.replace(
"\n",
"")
501 event_count_list.append(int(info.split(
" ")[1
]))
502 follow_count_list.append(int(info.split(
" ")[2
]))
503 fan_count_list.append(int(info.split(
" ")[3
]))
504 area_res= re.search(re.compile(u
'.*\d (.+?-.+?) .*?|.*(未知地区).*'),info)
505 if(area_res):
506 if(area_res.group(1
)):
507 area_list.append(area_res.group(1
))
508 age_list.append(info.split(
" ")[-2
])
509 listen_songs_num_list.append(int(info.split(
" ")[-1
]))
510 event_count =
[0,0,0,0]
511 follow_count =
[0,0,0,0,0]
512 fan_count =
[0,0,0,0,0]
513 listen_songs_num =
[0,0,0,0]
514 area_count =
[0,0,0,0,0,0]
515 age_count =
[0,0,0,0,0]
516 for content
in event_count_list:
517 if(content <= 10
):
518 event_count[0] += 1
519 elif(content <= 50
):
520 event_count[1] += 1
521 elif(content <= 100
):
522 event_count[2] += 1
523 else:
524 event_count[3] += 1
525 for content
in follow_count_list:
526 if(content < 10
):
527 follow_count[0] += 1
528 elif(content < 30
):
529 follow_count[1] += 1
530 elif(content < 50
):
531 follow_count[2] += 1
532 elif(content < 100
):
533 follow_count[3] += 1
534 else:
535 follow_count[4] += 1
536 for content
in fan_count_list:
537 if(content < 10
):
538 fan_count[0] += 1
539 elif(content < 100
):
540 fan_count[1] += 1
541 elif(content < 1000
):
542 fan_count[2] += 1
543 elif(content < 10000
):
544 fan_count[3] += 1
545 else:
546 follow_count[4] += 1
547 area_no_repeat_list = list(set(area_list))
# 去除重复
548 area_tuple = [(area,area_list.count(area))
for area
in area_no_repeat_list]
549 area_tuple.sort(key=
lambda x:x[1],reverse=True)
# 从高到低排列
550 for i
in range(5):
# 取出排名前4的地区
551 area_count[i] = area_tuple[i][1
]
552 area_count[5] = sum([x[1]
for x
in area_tuple[5:]])
# 前5名之外的全部地区数量
553 area_labels = [x[0]
for x
in area_tuple[0:5]]
# 前5个地区的名字
554 area_labels.append(u
"其他地区")
555 age_no_repeat_list = list(set(age_list))
# 去除重复
556 age_info = [age_list.count(age)
for age
in age_no_repeat_list]
557 for index,age_
in enumerate(age_no_repeat_list):
558 if(age_ != u
"未知年龄"):
# 排除未知年龄
559 if(int(age_) <= 17
):
560 age_count[0] += age_info[index]
# 00后
561 elif(int(age_)<=22):
# 95后
562 age_count[1] +=
age_info[index]
563 elif(int(age_)<=27):
# 90后
564 age_count[2] +=
age_info[index]
565 elif(int(age_)<=37):
# 80后
566 age_count[3] +=
age_info[index]
567 else:
568 age_count[4] += age_info[index]
# 80前
569 age_labels = [u
"00后",u
"95后",u
"90后",u
"80后",u
"80前"]
570
571 for content
in listen_songs_num_list:
572 if(content < 100
):
573 listen_songs_num[0] += 1
574 elif(content < 1000
):
575 listen_songs_num[1] += 1
576 elif(content < 10000
):
577 listen_songs_num[2] += 1
578 else:
579 listen_songs_num[3] += 1
580 for i
in range(6
):
581 if(i ==
0):
582 title = u
"%s:评论者<动态数目>分布" %
song_or_singer_name
583 labels = [u
"0-10",u
"10-50",u
"50-100",u
"100以上"]
584 colors = [
"red",
"blue",
"yellow",
"green"]
585 x =
event_count
586 plt.subplot(2,3,i+1
)
587 plt.pie(x,colors=colors,labels=labels,autopct=
"%1.1f%%")
588 plt.title(title)
589 # plt.legend(labels)
590 elif(i == 1
):
591 title = u
"%s:评论者<关注人数>分布" %
song_or_singer_name
592 labels = [u
"0-10",u
"10-30",u
"30-50",u
"50-100",u
"100以上"]
593 colors = [
"red",
"blue",
"yellow",
"green",
"white"]
594 x =
follow_count
595 plt.subplot(2,3,i+1
)
596 plt.pie(x,colors=colors,labels=labels,autopct=
"%1.1f%%")
597 plt.title(title)
598 # plt.legend(labels)
599 elif(i == 2
):
600 title = u
"%s:评论者<粉丝人数>分布" %
song_or_singer_name
601 labels = [u
"0-10"<
来源: http://www.cnblogs.com/lyrichu/p/6684939.html