port 爬取 all lba item text request top
# -*- coding: utf-8 -*-
# Scrapy spider that crawls the Douban Movie Top 250 list.

import scrapy

from douban.items import DoubanItem
- 6
- 7
class DoubanspiderSpider(scrapy.Spider):
    """Crawl the Douban Movie Top 250 list and follow its pagination.

    For every movie entry on a list page, one ``DoubanItem`` is yielded with
    the fields ``title``, ``content``, ``rating_num``, ``quote`` and
    ``image``. When the page has a "next" pagination link, a request for
    that page is scheduled with :meth:`parse` as its callback.
    """

    name = "doubanspider"
    # allowed_domains = ["movie.douban.com/top250"]
    # NOTE: left disabled on purpose -- restricting the crawl to the start
    # page may cause the follow-up (paginated) requests to be filtered out.
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Extract the movies on one list page, then request the next page.

        Args:
            response: The downloaded list page.

        Yields:
            A ``DoubanItem`` per movie entry, followed by a
            ``scrapy.Request`` for the next page if one is linked.
        """
        for each in response.css('.article .grid_view li'):
            # Build a fresh item per movie: reusing a single instance would
            # make every yielded item alias the same mutable object.
            item = DoubanItem()

            # The description may be missing; only strip when text exists
            # so one incomplete entry does not abort the whole page.
            content = each.css('.item .bd p::text').extract_first()
            if content is not None:
                content = content.strip()

            item['title'] = each.css(
                '.item .hd .title:nth-child(1)::text').extract_first()
            item['content'] = content
            item['rating_num'] = each.css(
                '.item .bd .star .rating_num::text').extract_first()
            item['quote'] = each.css(
                '.item .bd .quote span::text').extract_first()
            item['image'] = each.css(
                '.item .pic a img::attr(src)').extract_first()

            yield item

        # Build the request for the next page, if there is one.
        next_href = response.css(
            '.paginator .next a::attr(href)').extract_first()
        if next_href:
            # Resolve the (relative) href against the URL of the current
            # response instead of a hard-coded base.
            url = response.urljoin(next_href)
            self.logger.info("Next page: %s", url)
            yield scrapy.Request(url=url, callback=self.parse)
scrapy 爬取豆瓣电影 top250
来源: http://www.bubuko.com/infodetail-2142398.html