抓取新闻列表中所有分页的新闻详情, 包括标题, 正文, 时间, 来源等信息.
创建项目
scrapy startproject China
scrapy genspider -t crawl chinatech tech.china.com
- items.py
- from scrapy import Field, Item
class ChinaItem(Item):
    """Container for one scraped news article from tech.china.com."""

    title = Field()     # article headline
    text = Field()      # article body text
    datetime = Field()  # publication timestamp string (extracted by regex)
    source = Field()    # originating outlet, parsed after '来源:'
    url = Field()       # URL of the detail page the item came from
    website = Field()   # fixed site label ('中华网')
- chinatech.py
- import scrapy
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
- from China.items import *
- from China.loaders import *
class ChinatechSpider(CrawlSpider):
    """Crawl the tech.china.com article list (all pages) and parse each
    news detail page into a ChinaItem via ChinaLoader."""

    name = 'chinatech'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Detail pages: links matching article/*.html inside the listing
        # area are handed to parse_item.  Raw strings avoid invalid-escape
        # SyntaxWarnings for \. on modern Python (regex is unchanged).
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # Pagination: follow the "next page" link; no callback, so the
        # CrawlSpider just keeps extracting links from each listing page.
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(.," 下一页 ")]')),
    )

    def parse_item(self, response):
        """Extract title, body text, datetime, source, URL and site label
        from one article detail page and yield the loaded item."""
        loader = ChinaLoader(item=ChinaItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        # Raw strings for the regexes: \d and \s are invalid string escapes.
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                         re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()',
                         re=r'来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
- loaders.py
- from scrapy.loader import ItemLoader
- from scrapy.loader.processors import TakeFirst, Join, Compose
class NewsLoader(ItemLoader):
    """Base loader shared by all news spiders: every field collapses its
    collected value list to the first element by default."""

    default_output_processor = TakeFirst()
class ChinaLoader(NewsLoader):
    """Loader for 中华网 articles: the text and source fields may be
    collected as many fragments, so join them into one string and strip
    surrounding whitespace."""

    text_out = Compose(Join(), str.strip)
    source_out = Compose(Join(), str.strip)
- pipelines.py
- import json
class ChinaPipeline:
    """Append every scraped item to ``china.json`` as one JSON object
    followed by ``,\\n``."""

    def __init__(self):
        # ensure_ascii=False below writes raw Chinese characters, so the
        # file must be opened with an explicit UTF-8 encoding — the
        # platform default (e.g. GBK on Chinese Windows) could fail or
        # produce mojibake.
        self.file = open("china.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and return it unchanged so
        any later pipelines still receive it."""
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()
- settings.py
# Project identity and where Scrapy discovers spiders.
BOT_NAME = 'China'
SPIDER_MODULES = ['China.spiders']
NEWSPIDER_MODULE = 'China.spiders'

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Route every scraped item through the JSON-writing pipeline
# (lower number = earlier in the pipeline order).
ITEM_PIPELINES = {
    'China.pipelines.ChinaPipeline': 300,
}
来源: http://www.bubuko.com/infodetail-2663156.html