- #items.py
- # -*- coding: utf-8 -*-
- # Define here the models for your scraped items
- #
- # See documentation in:
- # http://doc.scrapy.org/en/latest/topics/items.html
- import scrapy
class YamaxunItem(scrapy.Item):
    """One Amazon.cn phone listing collected by the Yamaxun spider.

    Every field holds the raw ``extract()`` result (a list of strings)
    for one product card on a search-result page.
    """

    price = scrapy.Field()     # listed price text
    descript = scrapy.Field()  # product title / description line
    URL = scrapy.Field()       # link to the product detail page
    Photo = scrapy.Field()     # thumbnail image src
- #Yamaxun.py
- # -*- coding: utf-8 -*-
- from scrapy.selector import Selector
try:
    # Newer Scrapy releases expose Spider; older ones only BaseSpider.
    from scrapy.spider import Spider
except ImportError:
    # Narrowed from a bare `except:` so that unrelated errors
    # (KeyboardInterrupt, SystemExit, typos in the module) are not
    # silently swallowed by the fallback.
    from scrapy.spider import BaseSpider as Spider
- from scrapy.contrib.spiders import CrawlSpider,Rule
- from scrapy.contrib.linkextractors import LinkExtractor
- #from JD.misc.log import *
- from a.items import YamaxunItem
class Yamaxun(CrawlSpider):
    """Spider for Amazon.cn mobile-phone search results.

    Starts from one brand-filtered search URL per phone brand, follows
    the "next page" link on each result page, and yields one
    YamaxunItem per product card.  Run with: scrapy crawl Yamaxun
    """

    # Unique spider name: `scrapy crawl Yamaxun` selects this class.
    name = 'Yamaxun'

    # Percent-encoded brand filter values (brand name, optionally with
    # its Chinese transliteration).  BUG FIX: the original Newsmy entry
    # contained "%sE7%BA%BD" — a stray "%s" corrupting the UTF-8
    # percent-encoding — instead of "%E7%BA%BD".
    _BRANDS = [
        'Huawei%20%E5%8D%8E%E4%B8%BA',
        '%E9%AD%85%E6%97%8F',
        'Nokia%20%E8%AF%BA%E5%9F%BA%E4%BA%9A',
        'Samsung%20%E4%B8%89%E6%98%9F',
        'Coolpad%20%E9%85%B7%E6%B4%BE',
        'Lenovo%20%E8%81%94%E6%83%B3',
        'Apple',
        '%E5%B0%8F%E7%B1%B3',
        'ASUS%20%E5%8D%8E%E7%A1%95',
        'HTC',
        'Sony%20%E7%B4%A2%E5%B0%BC',
        'Philips%20%E9%A3%9E%E5%88%A9%E6%B5%A6',
        '21%E5%85%8B%E6%89%8B%E6%9C%BA',
        'ZTE%20%E4%B8%AD%E5%85%B4',
        'BIRD%20%E6%B3%A2%E5%AF%BC',
        'Newsmy%20%E7%BA%BD%E6%9B%BC',
        'TCL',
        'Microsoft%20%E5%BE%AE%E8%BD%AF',
        '%E5%8A%AA%E6%AF%94%E4%BA%9A',
        'SOYES',
        '%E7%BE%8E%E5%9B%BE',
        '%E7%88%B1%E6%B4%BE%E5%B0%94',
        '%E7%99%BE%E5%8A%A0',
        'Dazen%20%E5%A4%A7%E7%A5%9E',
        '%E4%B8%80%E5%8A%A0',
        '%E9%94%8B%E8%BE%BE%E9%80%9A',
        'BlackBerry%20%E9%BB%91%E8%8E%93',
        '%E9%87%91%E6%9D%A5',
        '%E8%8D%A3%E7%83%BD',
        '%E5%A4%A7%E6%98%BE',
        'GIONEE%20%E9%87%91%E7%AB%8B',
        'MANN',
        '%E6%AF%94%E9%85%B7',
        'Smartisan%20%E9%94%A4%E5%AD%90',
        '%E7%83%BD%E7%81%AB',
        'Konka%20%E5%BA%B7%E4%BD%B3',
        'vivo',
        '%E4%B8%B0%E5%87%AF%E8%BE%BE',
        '%E7%A7%BB%E5%8A%A8',
        '%E8%89%BE%E5%B0%94%E9%85%B7',
        'Melrose',
        '%E9%94%90%E6%97%8F',
        '%E6%B3%A2%E5%AF%BC',
        'M%C2%B7PARTY',
        'Haier%20%E6%B5%B7%E5%B0%94',
        'Meitu%20%E7%BE%8E%E5%9B%BE',
        '%E6%9C%97%E6%A0%BC',
        '%E7%99%BE%E5%90%88',
        'LOPPO%20%EF%BC%88%E5%A5%BD%E4%B9%8B%E5%A3%B0ai-ms%EF%BC%89',
        'Changhong%20%E9%95%BF%E8%99%B9',
        'Crucial%20%E8%8B%B1%E7%9D%BF%E8%BE%BE',
    ]

    # One page-1 search URL per brand.  The prefix literal is written
    # inline because class-scope names (other than the outermost
    # iterable) are not visible inside a comprehension on Python 3.
    start_urls = ['http://www.amazon.cn/s?ie=UTF8&page=1&rh=n%3A665002051%2Cp_4%3A' + brand
                  for brand in _BRANDS]

    def parse(self, response):
        """Parse one result page: schedule the next page, yield items.

        BUG FIX: the original appended the next-page URL to
        self.start_urls, which has no effect once the crawl has started;
        scheduling an explicit Request is the supported way to paginate.
        """
        # Local import: the module header only imports selector/spider
        # classes, so bring Request into scope here.
        from scrapy.http import Request

        next_page = response.xpath("//a[@id='pagnNextLink']/@href").extract()
        if next_page:
            yield Request('http://www.amazon.cn' + next_page[0], callback=self.parse)

        # Each <li class="s-result-item"> is one product card.  Querying
        # the node selectors with relative XPaths avoids the original's
        # extract-then-reparse round trip through Selector(text=...).
        for card in response.xpath("//li[@class='s-result-item']"):
            item = YamaxunItem()
            item['price'] = card.xpath("div/div[3]/div[1]/a/span/text()").extract()   # price text
            item['descript'] = card.xpath("div/div[2]/div[1]/a/h2/text()").extract()  # title line
            item['URL'] = card.xpath("div/div[2]/div[1]/a/@href").extract()           # detail-page link
            item['Photo'] = card.xpath(".//div/div[1]/div/div/a/img/@src").extract()  # thumbnail src
            yield item

    def _process_request(self, request):
        # Pass-through hook kept from the original (previously used for
        # debug logging of scheduled requests).
        return request
- #pipelines.py
- # -*- coding: utf-8 -*-
- # Define your item pipelines here
- #
- # Don't forget to add your pipeline to the ITEM_PIPELINES setting
- # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
- from scrapy import signals
- import json
- import codecs
class JsonWithEncodingPipeline(object):
    """Write each scraped item as one JSON line to Yamaxun_iPhone.json."""

    def __init__(self):
        # codecs writer so CJK text is stored as UTF-8, not \uXXXX escapes.
        self.file = codecs.open('Yamaxun_iPhone.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON object per line and pass it on.

        BUG FIX: the original appended "\\n" (a literal backslash + n),
        so every record landed on a single unreadable line; "\n" writes
        a real newline, producing valid JSON-lines output.
        """
        # ensure_ascii=False keeps the Chinese descriptions readable.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy calls close_spider() on pipelines
        # automatically; the original only defined spider_closed(),
        # which is never invoked unless explicitly connected to the
        # spider_closed signal, so the file was never closed/flushed.
        self.file.close()

    # Backward-compatible alias for any manual signal wiring.
    spider_closed = close_spider
- #settings.py
- # -*- coding: utf-8 -*-
- # Scrapy settings for a project
- #
- # For simplicity, this file contains only the most important settings by
- # default. All the other settings are documented here:
- #
- # http://doc.scrapy.org/en/latest/topics/settings.html
- #
# Project identity: used by Scrapy in logs and the default User-Agent.
BOT_NAME = 'a'

# Where Scrapy discovers spider classes, and where `genspider` creates
# new ones.
SPIDER_MODULES = ['a.spiders']
NEWSPIDER_MODULE = 'a.spiders'

# Route every scraped item through the JSON-lines pipeline
# (order value 300: lower numbers run first).
ITEM_PIPELINES = {
    'a.pipelines.JsonWithEncodingPipeline': 300,
}
- # Crawl responsibly by identifying yourself (and your website) on the user-agent
- #USER_AGENT = 'a (+http://www.yourdomain.com)'
- #该片段来自于http://www.codesnippet.cn/detail/0806201512785.html
来源: http://www.codesnippet.cn/detail/0806201512785.html