Scrapy 爬取新浪微博移动版用户首页第一条微博

大家好, 本月第一次更新.

最近找了一份关于爬虫的实习工作, 需要爬取较大量的数据, 这时就发现通过自己编写函数来实现爬虫效率太慢了; 于是又转回来用 scrapy, 以前稍微学习了一下, 这次刚好爬爬微博练练手, 而后再使用部分数据生成词云.

本次爬取的是新浪微博移动端( https://m.weibo.cn/ ), 爬取的数据是用户微博首页的第一条微博(如下图), 包括文字内容, 转发量, 评论数, 点赞数和发布时间, 还有用户名和其所在地区(后面可以分析不同地区微博用户关心的热点话题).

一, 分析网页

获取用户微博入口 url

浏览发现该页面是使用 Ajax 渲染的, 微博数据 (例如 https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&since_id=1 ) 存储在 JSON 格式的响应中, 所以思路是先通过微博数据得到用户 id, 进而构造用户 url(如下图), 再来爬取后续内容.

获取第一条微博数据

也是使用了 Ajax 渲染的网页, 跟上面一样找到网页入口就行了. 请求网址如下:

这样看网址的话毫无规律可言, 但把它简化为 https://m.weibo.cn/api/container/getIndex?containerid=107603(***) 后发现同样可以进入. 而且 containerid=107603(***) 这里, 括号里的数字刚好是用户的 id 号, 因此我们可以通过这个来构造网址.

获取用户所在地区

用户所在地在其基本资料中, 如下图

地址为:

同样进行简化得到: https://m.weibo.cn/api/container/getIndex?containerid=230283(***)_-_INFO 其中括号里面是用户 id 号.

通过以上分析可知, 获取用户的 id 号是本次爬取数据的关键, 只需要用 id 构成网址, 后面的爬取就相对简单了. 下面是编程部分.

二, 编程爬取

scrapy startproject sinaweibo
scrapy genspider xxx(爬虫名) xxx(所在域名)

import scrapy
class SinaweiboItem(scrapy.Item):
    """Data collected for one Weibo user and the first post on their home page."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()       # user name
    first_news = scrapy.Field()     # first post on the user's home page
    dates = scrapy.Field()     # publication time of that post
    zhuanzai = scrapy.Field()       # repost count
    comment = scrapy.Field()        # comment count
    agree = scrapy.Field()      # like count
    city = scrapy.Field()       # region the user lives in

# -*- coding: utf-8 -*-
"""Spider that collects the first post on each Weibo user's mobile home page."""
import copy  # noqa: F401 -- retained file-level import; the spider no longer uses it
import json
import re

import scrapy

from sinaweibo.items import SinaweiboItem


class WeibodiyuSpider(scrapy.Spider):
    """Crawl m.weibo.cn feed pages and emit one item per discovered user.

    Callback chain for every user found on a feed page:

    1. ``parse``  -- feed page: extract user ids, request each profile.
    2. ``parse2`` -- profile info page: extract the user's region.
    3. ``parse1`` -- user timeline: extract the first post and emit the item.
    """

    name = 'weibodiyu'  # spider name
    allowed_domains = ['m.weibo.cn']  # only crawl within this domain
    start_urls = ['https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4188_-_ctg1_4188&openApp=0&since_id=1'
                  ]

    # Profile "basic info" endpoint; the placeholder is the user id.
    _INFO_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO'
    # User timeline endpoint; the placeholder is the user id.
    _TIMELINE_URL = 'https://m.weibo.cn/api/container/getIndex?&containerid=107603{}'
    # (feed URL template, first page to request). Pages run up to _LAST_PAGE.
    # The second feed starts at 2 because its page 1 is ``start_urls``.
    _FEEDS = (
        ('https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4288_-_ctg1_4288&openApp=0&since_id={}', 1),
        ('https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4188_-_ctg1_4188&openApp=0&since_id={}', 2),
        ('https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&since_id={}', 1),
    )
    _LAST_PAGE = 99
    # At most this many cards are inspected per feed page.
    _USERS_PER_PAGE = 20

    # Runs of CJK characters; used to drop markup and other noise from post text.
    _CJK_RE = re.compile('([\u4e00-\u9fa5]+)')
    # Applied to ``str(card_group)``: captures what follows the "所在地" entry up
    # to the closing "}]" of the list.
    # NOTE(review): this assumes the region entry is the last one in the group.
    _CITY_RE = re.compile(r"card_type.*?所在地.*?item.*?:(.*?)}]", re.S)

    # Set once the paginated feed URLs have been queued (see ``parse``).
    _pagination_scheduled = False

    def _cards(self, response):
        """Return the ``data.cards`` list of a JSON response, or ``[]``."""
        try:
            cards = json.loads(response.body)["data"]["cards"]
        except (KeyError, TypeError, ValueError) as exc:
            self.logger.debug('No cards in %s: %r', response.url, exc)
            return []
        return cards if isinstance(cards, list) else []

    def parse1(self, response):
        """Fill the item from the first post of a user's timeline.

        Expects ``response.meta['item']`` to be the item started in ``parse``
        (its ``city`` is already set by ``parse2``).

        Returns:
            The completed item, or ``None`` when the timeline has no usable
            first post.
        """
        item = response.meta['item']
        try:
            post = self._cards(response)[0]["mblog"]
            name = post["user"]["screen_name"]
            first_news = self._CJK_RE.findall(str(post["text"]))
            dates = post["created_at"]
            zhuanzai = post["reposts_count"]
            comment = post["comments_count"]
            agree = post["attitudes_count"]
        except (IndexError, KeyError, TypeError) as exc:
            self.logger.debug('No first post in %s: %r', response.url, exc)
            return None
        # Assign only after every lookup succeeded, so a partial item is never
        # emitted.
        item['name'] = name
        item['first_news'] = first_news
        item['dates'] = dates
        item['zhuanzai'] = zhuanzai
        item['comment'] = comment
        item['agree'] = agree
        return item

    def parse2(self, response):
        """Extract the user's region, then request the user's timeline.

        Expects ``response.meta`` to carry ``item`` (the item being built) and
        ``ids`` (the user id as a string).
        """
        item = response.meta['item']
        ids = response.meta['ids']
        try:
            city_cont = str(self._cards(response)[1]["card_group"])
            city = self._CITY_RE.findall(city_cont)[0]
        except (IndexError, KeyError, TypeError) as exc:
            self.logger.debug('No region in %s: %r', response.url, exc)
            return
        # Strip the quotes and spaces left over from the repr of the value.
        item['city'] = city.replace('\'', '').replace(' ', '')
        yield scrapy.Request(
            self._TIMELINE_URL.format(ids),
            meta={'item': item},
            callback=self.parse1,
        )

    def parse(self, response):
        """Handle a feed page: request the profile of every post author.

        The first call also queues all paginated feed URLs.
        """
        for card in self._cards(response)[:self._USERS_PER_PAGE]:
            try:
                ids = str(card["mblog"]["user"]["id"])
            except (KeyError, TypeError):
                continue  # card carries no post author; nothing to follow
            # A fresh item per user: sharing one item between requests lets
            # concurrent callbacks overwrite each other's fields.
            yield scrapy.Request(
                self._INFO_URL.format(ids),
                meta={'item': SinaweiboItem(), 'ids': ids},
                callback=self.parse2,
            )

        # Queue the paginated feeds once, rather than re-yielding every URL on
        # every feed response and leaving Scrapy's duplicate filter to drop them.
        if self._pagination_scheduled:
            return
        self._pagination_scheduled = True
        for template, first_page in self._FEEDS:
            for page in range(first_page, self._LAST_PAGE + 1):
                yield scrapy.Request(template.format(page), callback=self.parse)

# Scrapy settings for the sinaweibo project.
BOT_NAME = 'sinaweibo'
SPIDER_MODULES = ['sinaweibo.spiders']
NEWSPIDER_MODULE = 'sinaweibo.spiders'

# Browser-like User-Agent header sent with every request.
# This must be an assignment: a bare ``USER_AGENT: '...'`` is only a variable
# annotation and leaves the setting undefined.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
DOWNLOAD_DELAY = 0.5    # wait 0.5 s between requests

# Feed export: write the scraped items to a CSV file.
# NOTE: Scrapy 2.1+ deprecates FEED_URI/FEED_FORMAT in favour of the FEEDS
# setting; they are kept here for the Scrapy version this project was written for.
FEED_URI = 'file:C:/Users/lenovo/Desktop/weibo.csv'    # output file location
FEED_FORMAT = 'csv'    # output format
FEED_EXPORT_ENCODING = 'UTF8'   # output encoding

ITEM_PIPELINES = {'sinaweibo.pipelines.SinaweiboPipeline': 300}     # item pipeline and its order

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

"""Print jieba keyword weights for the crawled posts, grouped by region."""
import csv  # noqa: F401 -- retained file-level import

import jieba.analyse
import pandas as pd


def get_ciyun(city, top_k=100):
    """Print the top keywords of a group of post texts for a word cloud.

    One line is printed per keyword, as ``word<TAB>weight``. The weight is the
    value returned by ``jieba.analyse.extract_tags`` multiplied by 1000 and
    truncated to an int.

    Args:
        city: The post texts of one region, either a single string or an
            iterable of strings (despite the name, this is not a city name).
            Non-string entries, such as NaN from empty CSV cells, are ignored.
        top_k: Maximum number of keywords to print.
    """
    if isinstance(city, str):
        text = city
    else:
        # Join the texts instead of using str(list), which would feed list
        # punctuation and 'nan' into the tokenizer.
        text = ' '.join(entry for entry in city if isinstance(entry, str))
    for word, weight in jieba.analyse.extract_tags(text, topK=top_k, withWeight=True):
        print('{}\t{}'.format(word, int(weight * 1000)))


need_citys = ['北京', '上海', '湖南', '四川', '广东']
beijing, shanghai, hunan, sichuan, gd = [], [], [], [], []
# Region keyword -> list collecting that region's posts (same order as need_citys).
buckets = dict(zip(need_citys, (beijing, shanghai, hunan, sichuan, gd)))

pd.set_option('expand_frame_repr', True)    # allow wide frames to wrap
pd.set_option('display.max_rows', None)    # show all rows
pd.set_option('display.max_columns', None)    # show all columns

# Raw string: the backslashes in a Windows path must not be read as escapes.
df = pd.read_csv(r'C:\Users\lenovo\Desktop\weibo.csv')    # load the crawl output as a DataFrame
contents = df['first_news']    # post texts
city = df['city']    # user regions

for text, place in zip(contents, city):
    if not isinstance(place, str):
        continue    # empty region cell (read as NaN); cannot be matched
    # The first matching region wins, as in an if/elif chain.
    for region, bucket in buckets.items():
        if region in place:
            bucket.append(text)
            break

# Output: one keyword list per region, separated by a dashed line.
for position, bucket in enumerate(buckets.values()):
    if position:
        print('-' * 20)
    get_ciyun(bucket)

来源: https://www.cnblogs.com/berryguotoshare/p/10852404.html

与本文相关文章

暂无,快来抢沙发吧！