使用 scrapy, 拼接 url, 找到翻页参数, 保存为 JSON
lj.py
# -*- coding: utf-8 -*-
- import copy
- import re
- import time
- import scrapy
- from lianjia.items import LianjiaItem
class LjSpider(scrapy.Spider):
    """Crawl Lianjia: collect every city link from the city index page,
    then scrape second-hand housing listings (neighbourhood, total price,
    unit price) from each city's paginated results.
    """
    name = 'lj'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    # Session cookies captured from a browser, needed to get past the
    # city-selection interstitial.  NOTE(review): these expire — refresh
    # before running.
    cookies_str = """lianjia_uuid=47a174e2-625a-4b22-a3c2-25fd1ec31b81; _ga=GA1.2.1898977159.1552302637; _gid=GA1.2.96347100.1552302637; lianjia_ssid=95f26562-a063-4c30-a718-28df4f75bc9c; _smt_uid=5c864d54.1796444c; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1552305492; gr_user_id=f98cfb4a-0fd7-48b6-bd92-84b72f7cda35; gr_session_id_a1a50f141657a94e=50c2d270-7331-4957-9f3c-ea51bf6dc831; gr_session_id_a1a50f141657a94e_50c2d270-7331-4957-9f3c-ea51bf6dc831=true; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1552307519; select_city=341100; lj_newh_session=eyJpdiI6IjRmZGdtYjR6Q3FEK2RoVVVBbGRib3c9PSIsInZhbHVlIjoicGR4N3hyZVwvRHN2dEFaR3pBY1Jodm1QVkZ2QVFuSTdia0RcL1wvVUpCU2JjZnpxTWRNWE9JTWxWOG1OZUZWMU52bXdZQ1wveGk0cUViK1hrZFVYblwvVlpiQT09IiwibWFjIjoiN2ZhOGU5N2Y0YWVmOGIyYjRmM2I4YTdmNzQzNDMxMzk5N2ZlYjQzNmU1MzI3OTQ0YTM3YjE4NDhlMDRkZTM2NyJ9"""
    # Parse "k=v; k=v" into a dict.  strip() drops the space left after
    # splitting on ';', and split('=', 1) keeps any '=' characters that
    # occur INSIDE a value (e.g. base64 padding in lj_newh_session) — the
    # original unbounded split truncated such values.
    cookies_dict = dict(pair.strip().split('=', 1)
                        for pair in cookies_str.split(';'))

    def parse(self, response):
        """Yield one listings request per city found on the index page."""
        city_list = response.xpath('//div/ul/li/div/div/ul/li/a')
        for data in city_list:
            # NOTE(review): the original called time.sleep(1) here, which
            # blocks Scrapy's event loop; use the DOWNLOAD_DELAY setting
            # for throttling instead.
            city_dict = LianjiaItem()
            city_dict['name'] = data.xpath('./text()').extract_first()
            # pg1 = first results page; co32 is a fixed sort parameter.
            city_dict['link'] = data.xpath('./@href').extract_first() + 'ershoufang/pg1co32/'
            # deepcopy: each request must carry its own snapshot of the
            # item, since the same dict is mutated on every iteration.
            yield scrapy.Request(city_dict['link'], encoding='utf-8',
                                 cookies=self.cookies_dict,
                                 callback=self.parse_city,
                                 meta={'city_h': copy.deepcopy(city_dict)})

    def parse_city(self, response):
        """Extract listings from one results page, then follow pagination."""
        city_dict = response.meta['city_h']
        hourse_list = response.xpath('//div[1]/ul/li/div[1]')
        for hourse in hourse_list:
            hoursing_area = hourse.xpath('./div[3]/div/a/text()').extract_first()
            if hoursing_area:
                city_dict['hoursing_area'] = hoursing_area
                city_dict['hoursing_total_price'] = hourse.xpath('./div[6]/div[1]/span/text()').extract_first()
                city_dict['hoursing_unit_price'] = hourse.xpath('./div[6]/div[2]/span/text()').extract_first()
                yield city_dict

        # Base URL for pagination plus the page count embedded in the
        # page's inline JS ('totalPage":N').
        city_url = response.xpath('//body/div[1]/div/ul/li[2]/a/@href').extract_first()
        page_total = re.findall(r'totalPage":(\d{1,3})', response.text)
        if page_total and city_url:
            pages = int(page_total[0])
            # Follow pages 2..pages inclusive (the original range(2, pages)
            # skipped the last page), capped at page 3 to limit crawl volume
            # — same effective cap as the original early return.
            for city_page in range(2, min(pages, 3) + 1):
                page_url = city_url.rstrip('/') + '/pg{}co32/'.format(city_page)
                # Pass cookies and a per-request item copy, consistent
                # with parse() — the original shared one mutable item
                # across all concurrent page requests.
                yield scrapy.Request(page_url, cookies=self.cookies_dict,
                                     callback=self.parse_city,
                                     meta={'city_h': copy.deepcopy(city_dict)})
pipelines.py
如果信息多的话用 JsonLinesItemExporter 保存更好
- from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter
class AqiJsonPipeline(object):
    """Serialize every scraped item into 'lianjia2.json' as one JSON array.

    Uses scrapy's JsonItemExporter; for large crawls the blog note above
    suggests JsonLinesItemExporter instead (one JSON object per line).
    """

    def open_spider(self, spider):
        # The exporter emits bytes, hence binary write mode.
        self.file = open('lianjia2.json', 'wb')
        self.writer = JsonItemExporter(self.file,
                                       ensure_ascii=False,
                                       encoding='utf-8')
        self.writer.start_exporting()

    def process_item(self, item, spider):
        # Export and hand the item on unchanged to any later pipeline.
        self.writer.export_item(item)
        return item

    def close_spider(self, spider):
        # Close the JSON array, then release the file handle.
        self.writer.finish_exporting()
        self.file.close()
items.py
- import scrapy
class LianjiaItem(scrapy.Item):
    """Container for one second-hand-housing record scraped from Lianjia."""

    # Field names (including the 'hoursing' spelling) are referenced by
    # the spider and exported JSON, so they must stay exactly as-is.
    name = scrapy.Field()                  # city name
    link = scrapy.Field()                  # city listings URL (pg1co32 page)
    hoursing_area = scrapy.Field()         # neighbourhood / housing area
    hoursing_total_price = scrapy.Field()  # total price
    hoursing_unit_price = scrapy.Field()   # price per unit area
来源: http://www.bubuko.com/infodetail-2985552.html