Requirement: scrape the new-development listings under Anjuke — Guangzhou (安居客—广州—新楼盘), down to a set of fields on each development's detail page.
Difficulty: developments come in all sorts of types, and different developments use different field names. Within a single type, say residential, there are further sub-cases: pre-sale (期房在售), completed and on sale (现房在售), pending sale (待售), and final units (尾盘); the other types have similar splits. So the set of fields cannot be fixed in advance.
Solution: I came up with two approaches. First: define no fields in items.py and let the spider detect the fields while crawling (i.e. keep whatever fields a page happens to have), then yield the result as a dict and store it. Second: write a separate extraction rule for each page layout, which is clearly impractical. I went with the first approach (a rough sketch of the idea appears right after the example URL below); if you have other ideas, feel free to share.
Target site: http://gz.fang.anjuke.com/ — the development data under that page.
Example detail page: http://gz.fang.anjuke.com/loupan/canshu-298205.html?from=loupan_tab
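To see why fixed fields will not work, you can peek at one parameter page directly. Below is a rough sketch of the dynamic-field idea, assuming the page lays each field out as a <div class="name"> label followed by a <div class="des"> value (the same classes the spider further down relies on) and is reachable without extra request headers:

import requests
from bs4 import BeautifulSoup

# Fetch one detail ("canshu") page and list whatever field names it happens to expose.
url = "http://gz.fang.anjuke.com/loupan/canshu-298205.html?from=loupan_tab"
soup = BeautifulSoup(requests.get(url).text, "lxml")

names = [n.get_text(strip=True) for n in soup.find_all(class_="name")]
values = [v.get_text(strip=True) for v in soup.find_all(class_="des")]

# Field names differ from development to development, so just zip whatever is there.
record = dict(zip(names, values))
print(record.keys())

Running this on a few different developments shows different key sets, which is exactly why the spider collects key/value pairs instead of filling fixed item fields.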
Now for the Scrapy code itself; I skip the steps for creating the project.
1、count.py
# -*- coding: utf-8 -*-
__author__ = 'Oscar_Yang'
"""
count.py: a small script that keeps printing how many documents MongoDB holds.
"""
import time
import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client["SCRAPY_anjuke_gz"]
sheet = db["anjuke_doc1"]

while True:
    print(sheet.find().count())
    print("____________________________________")
    time.sleep(3)
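Side note: Cursor.count() was deprecated in pymongo 3.7 and removed in pymongo 4.0. If your driver is that new, a roughly equivalent sketch of the same monitoring loop would be:

import time
import pymongo

client = pymongo.MongoClient("localhost", 27017)
sheet = client["SCRAPY_anjuke_gz"]["anjuke_doc1"]

while True:
    # count_documents({}) replaces the deprecated find().count()
    print(sheet.count_documents({}))
    print("____________________________________")
    time.sleep(3)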
- 1 """
- 2 entrypoint.py
- 3 """
- 4 from scrapy.cmdline import execute
- 5 execute(['scrapy', 'crawl', 'anjuke_gz'])
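If you would rather not go through the command-line wrapper, a sketch of the same launch with CrawlerProcess (assuming it is run from the project root so the project settings can be located):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("anjuke_gz")   # the spider name defined in the spider class
process.start()              # blocks until the crawl finishes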
Then settings.py. It is mostly the generated default; the parts I changed are the MONGODB_* constants near the top, ROBOTSTXT_OBEY = False, the ITEM_PIPELINES entry, and the HTTP cache block at the bottom:

# -*- coding: utf-8 -*-
"""
settings.py
"""

# Scrapy settings for anjuke_gz project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke_gz'

SPIDER_MODULES = ['anjuke_gz.spiders']
NEWSPIDER_MODULE = 'anjuke_gz.spiders'

# Custom constants read by the MongoDB pipeline below.
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DBNAME = "SCRAPY_anjuke_gz"
MONGODB_DOCNAME = "anjuke_doc1"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'anjuke_gz (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke_gz.middlewares.AnjukeGzSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke_gz.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjuke_gz.pipelines.AnjukeGzPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
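The four MONGODB_* keys are not built-in Scrapy settings; they are just project-level constants that the pipeline below reads. If you want to sanity-check them without running the crawl, a sketch of a hypothetical check_settings.py, run from the project root:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("MONGODB_HOST"), settings.getint("MONGODB_PORT"))
print(settings.get("MONGODB_DBNAME"), settings.get("MONGODB_DOCNAME"))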
Next is items.py. Since no fields are defined, it is left as the default generated code.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AnjukeGzItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
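Leaving items.py untouched and yielding plain dicts is perfectly fine: Scrapy accepts dicts wherever it accepts Items. If you ever want Item semantics but with dynamic fields, one commonly seen workaround (not part of this project's code, just a sketch) is an Item that registers fields on the fly:

import scrapy

class DynamicItem(scrapy.Item):
    """Item subclass that accepts any key by creating the field on first use."""
    def __setitem__(self, key, value):
        if key not in self.fields:
            # Note: self.fields is shared at class level, so fields accumulate
            # across all instances of this class.
            self.fields[key] = scrapy.Field()
        super().__setitem__(key, value)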
Next is pipelines.py, where the MongoDB connection is configured.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings


class AnjukeGzPipeline(object):
    def __init__(self):
        # Pull the MongoDB connection info defined in settings.py
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbname]
        self.post = tdb[settings["MONGODB_DOCNAME"]]

    def process_item(self, item, spider):
        # The spider yields plain dicts, so dict(item) is just a copy here
        info = dict(item)
        self.post.insert(info)
        return item
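One caveat: from scrapy.conf import settings is deprecated and has been removed in newer Scrapy releases. If you hit an ImportError, a sketch of the same pipeline using the standard from_crawler hook (and pymongo's insert_one) would be:

import pymongo

class AnjukeGzPipeline(object):
    def __init__(self, host, port, dbname, docname):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[dbname][docname]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, exposing crawler.settings
        s = crawler.settings
        return cls(s.get("MONGODB_HOST"), s.getint("MONGODB_PORT"),
                   s.get("MONGODB_DBNAME"), s.get("MONGODB_DOCNAME"))

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item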
Finally, the core of it all: spider.py.
from scrapy.http import Request
import scrapy
from bs4 import BeautifulSoup
import re
import requests
"""
The spider itself.
"""


class Myspider(scrapy.Spider):
    name = 'anjuke_gz'
    allowed_domains = ['gz.fang.anjuke.com']  # a domain name, not a full URL
    # List pages 1-39 of all new developments in Guangzhou
    start_urls = ["http://gz.fang.anjuke.com/loupan/all/p{}/".format(i) for i in range(1, 40)]

    def parse(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        content = soup.find_all(class_="items-name")  # one link per development on the list page
        for item in content:
            code = item["href"].split("/")[-1][:6]
            # Build the development's parameter (detail) page URL from its code
            real_href = "http://gz.fang.anjuke.com/loupan/canshu-{}.html?from=loupan_tab".format(code)
            res = requests.get(real_href)
            soup = BeautifulSoup(res.text, "lxml")
            # Field labels live in <div class="name">, values in <div class="des">;
            # zip them so whatever fields the page has are kept.
            a = re.findall(r'<div class="name">(.*?)</div>', str(soup))
            b = soup.find_all(class_="des")
            data = {}
            for (i, j) in zip(range(len(b)), a):
                data[j] = b[i].text.strip().strip("\t")
            data["url"] = real_href
            yield data
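One thing worth noting: the requests.get call inside parse() downloads the detail pages synchronously and bypasses Scrapy's scheduler, throttling, and the HTTP cache enabled in settings.py. A more Scrapy-native sketch of the same logic (assuming the items-name class sits on the <a> tag, as the BeautifulSoup code above implies) would hand the detail URL back to the engine:

from scrapy.http import Request
import scrapy
from bs4 import BeautifulSoup

class MyspiderNative(scrapy.Spider):
    name = "anjuke_gz_native"   # hypothetical variant of the spider above
    allowed_domains = ["gz.fang.anjuke.com"]
    start_urls = ["http://gz.fang.anjuke.com/loupan/all/p{}/".format(i) for i in range(1, 40)]

    def parse(self, response):
        for href in response.css("a.items-name::attr(href)").extract():
            code = href.split("/")[-1][:6]
            url = "http://gz.fang.anjuke.com/loupan/canshu-{}.html?from=loupan_tab".format(code)
            # Let Scrapy schedule, throttle and cache the detail-page download
            yield Request(url, callback=self.parse_detail)

    def parse_detail(self, response):
        soup = BeautifulSoup(response.text, "lxml")
        names = [n.get_text(strip=True) for n in soup.find_all(class_="name")]
        values = [v.get_text(strip=True) for v in soup.find_all(class_="des")]
        data = dict(zip(names, values))
        data["url"] = response.url
        yield data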
Below is how the data ends up in MongoDB.
Because a single extraction rule is applied to pages with different structures, the fields are not scraped one by one, so the stored data still needs cleaning before it can be analysed.
Running MongoDB queries from Python and then working on the results with pandas should make that cleaning fairly straightforward.
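For example, a minimal sketch of pulling everything into a pandas DataFrame for cleaning, assuming the same database and collection names as above:

import pandas as pd
import pymongo

client = pymongo.MongoClient("localhost", 27017)
sheet = client["SCRAPY_anjuke_gz"]["anjuke_doc1"]

# Load all documents; drop MongoDB's _id and keep whatever fields each listing had.
df = pd.DataFrame(list(sheet.find({}, {"_id": 0})))
print(df.shape)                 # listings without a given field simply get NaN there
print(df.columns.tolist())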
Source: http://www.cnblogs.com/coskaka/p/6165520.html