Environment and tools: Python 2.7, Scrapy
Target site: http://www.27270.com/tag/333.html — crawl every bunny-girl image on the tag pages; the "recommended" links below each post must be filtered out.
Approach: analyze the site structure, enable ITEM_PIPELINES for downloading the images and inserting records into MySQL, turn on the thumbnail settings, then move the downloaded files into per-tag folders.
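Once the pieces below are in place, the spider runs like any Scrapy project. A minimal launcher, equivalent to typing `scrapy crawl pic` in the project root (the spider name 'pic' is defined in spider.py below):

# run.py, placed next to scrapy.cfg in the project root
from scrapy import cmdline
cmdline.execute('scrapy crawl pic'.split())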
-----settings.py
## ignore robots.txt
ROBOTSTXT_OBEY = False
## default download delay
DOWNLOAD_DELAY = 3
## disable cookies
COOKIES_ENABLED = False
## enable ITEM_PIPELINES
ITEM_PIPELINES = {
    'MyPicSpider.pipelines.MyImagesPipeline': 300,
    'MyPicSpider.pipelines.MysqlPipeline': 400,
}
## storage path for downloaded images
IMAGES_STORE = 'G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\'
## filter out small images (drops the "recommended" thumbnails)
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
## thumbnail sizes
IMAGES_THUMBS = {
    'big': (270, 270),
}
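Note that ImagesPipeline names every downloaded file after the SHA1 hash of its URL: the original lands under full/ and each IMAGES_THUMBS entry under thumbs/<name>/ inside IMAGES_STORE. The shutil.move calls in pipelines.py below rely on exactly this layout. A small sketch of the naming rule (this mirrors Scrapy 1.x default behavior, not project code):

import hashlib

def default_image_paths(image_url):
    # Scrapy's default file_path(): SHA1 of the request URL, saved as .jpg
    name = hashlib.sha1(image_url).hexdigest()
    return {
        'original': 'full/%s.jpg' % name,         # relative to IMAGES_STORE
        'big_thumb': 'thumbs/big/%s.jpg' % name,  # one folder per IMAGES_THUMBS key
    }

print default_image_paths('http://t1.27270.com/uploads/example.jpg')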
-----items.py
import scrapy

class PicspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    tag = scrapy.Field()
    image_urls = scrapy.Field()
    images_data = scrapy.Field()
    img_path = scrapy.Field()
    img_big_path = scrapy.Field()
    file_path = scrapy.Field()
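To make the data flow explicit: the spider fills in tag, file_path and image_urls; MyImagesPipeline then adds img_path for MysqlPipeline to read. A hypothetical populated item (all values here are illustrative):

from MyPicSpider.items import PicspiderItem

item = PicspiderItem()
item['tag'] = u'some-tag'          # taken from the detail page's <h1>
item['file_path'] = 'G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\some-tag'
item['image_urls'] = ['http://t1.27270.com/uploads/example.jpg']
# filled in later by MyImagesPipeline:
# item['img_path'] = [('picture/meinv/rabbit/some-tag/<sha1>.jpg',
#                      'picture/meinv/rabbit/some-tag/<sha1>_b.jpg')]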
-----pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import datetime
import shutil
import scrapy
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
# import the project settings
from scrapy.utils.project import get_project_settings

class MyImagesPipeline(ImagesPipeline):
    # read the image download path from the project settings
    img_store = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        '''one request per image URL'''
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        file_path = item['file_path']
        # create the per-tag target directory if it does not exist yet
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        print image_paths
        ## e.g. image_paths == ['full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg']
        pic_list = []
        for v in image_paths:
            pic_name = v.replace('full/', '')
            pic_small_name = pic_name.replace('.jpg', '') + '_s.jpg'
            pic_big_name = pic_name.replace('.jpg', '') + '_b.jpg'
            # move the files from the default download location into the per-tag folder
            # move the original image
            shutil.move(self.img_store + 'full\\' + pic_name, file_path + "\\" + pic_name)
            # move the thumbnail
            #shutil.move(self.img_store + 'thumbs\\small\\' + pic_name, file_path + "\\" + pic_small_name)
            shutil.move(self.img_store + 'thumbs\\big\\' + pic_name, file_path + "\\" + pic_big_name)
            # record the web-facing paths for the database pipeline
            img_path_dict = ('picture/meinv/rabbit/' + item['tag'] + "/" + pic_name,
                             'picture/meinv/rabbit/' + item['tag'] + "/" + pic_big_name)
            pic_list.append(img_path_dict)
        item["img_path"] = pic_list
        return item
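A design note: instead of moving files with shutil after download, ImagesPipeline also lets you override file_path() so each file is written into its per-tag folder directly. A minimal sketch, assuming the Scrapy 1.x signature and passing the tag through request.meta (the class name and scheme are hypothetical, not the original code):

import hashlib
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class DirectPathImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # carry the tag along so file_path() can build the target folder
            yield scrapy.Request(image_url, meta={'tag': item['tag']})

    def file_path(self, request, response=None, info=None):
        # save as <tag>/<sha1-of-url>.jpg under IMAGES_STORE; no move step needed
        name = hashlib.sha1(request.url).hexdigest()
        return '%s/%s.jpg' % (request.meta['tag'], name)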
## database insertion
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        ### assemble the rows to insert
        value_list = []
        datetime_now = datetime.datetime.now()
        datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(datetime_now.year, datetime_now.month, datetime_now.day, datetime_now.hour, datetime_now.minute, datetime_now.second)
        ## insert the type (tag) if it does not exist yet
        result = self.cursor.execute(u"select id from network_type where RESOURCETYPE='p' and TYPENAME='{0}'".format(item['tag']))
        if result == 0:
            self.cursor.execute("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s)", (2415, 'p', item['tag']))
            typeid = self.cursor.lastrowid
            self.conn.commit()
        else:
            #tag_id = self.cursor.fetchall()
            #typeid = tag_id[0][0]
            return False
        types = ',' + str(typeid) + ','
        # continue from the current highest numeric id
        self.cursor.execute('select id from network_picture order by cast(id as SIGNED INTEGER) desc limit 0,1')
        old_id = self.cursor.fetchone()
        if old_id:
            id_n = str(int(old_id[0]) + 1)
        else:
            id_n = str(1)
        for v in item['img_path']:
            path1 = v[0]
            path2 = v[1]
            self.cursor.execute(u'select id from network_picture where FILEPATH="{0}" and fileScalPath="{1}"'.format(path1, path2))
            data = self.cursor.fetchone()
            if data:
                print u'this record already exists'
            else:
                a = (str(id_n), '', path1, '', types, 0, datetime_str, path2)
                value_list.append(a)
                id_n = int(id_n) + 1
        print value_list
        self.cursor.executemany("insert into network_picture(ID,NAME,FILEPATH,FILESIZE,TYPES,STATUS,DATETIME,fileScalPath)values(%s,%s,%s,%s,%s,%s,%s,%s)", value_list)
        self.conn.commit()
        return item
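Two small robustness notes on this pipeline, sketched under the same assumptions as above (the class name is hypothetical): returning False from process_item merely passes a falsy value to the next pipeline, while raising DropItem discards the item explicitly; and close_spider is the natural place to release the MySQL connection.

import pymysql
from scrapy.exceptions import DropItem

class MysqlPipelineSketch(object):
    def __init__(self):
        self.conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        result = self.cursor.execute(u"select id from network_type where RESOURCETYPE='p' and TYPENAME='{0}'".format(item['tag']))
        if result != 0:
            # idiomatic replacement for `return False`
            raise DropItem('tag already recorded: %s' % item['tag'])
        # ... insert logic as in MysqlPipeline above ...
        return item

    def close_spider(self, spider):
        # called once when the spider closes; release the connection
        self.cursor.close()
        self.conn.close()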
-----spider.py
# -*- coding: utf-8 -*-
import scrapy, os, urllib2
from scrapy.linkextractors import LinkExtractor  ## LinkExtractor filters and follows links; it has many more options, see the docs
from scrapy.spiders import CrawlSpider, Rule     ## the spider template and the Rule class
from MyPicSpider.items import PicspiderItem      ## the item defined in items.py
# import the project settings
from scrapy.utils.project import get_project_settings
from bs4 import BeautifulSoup
import time, pymysql

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
# create a cursor
cursor = conn.cursor()

class PicSpider(CrawlSpider):  ## inherits the CrawlSpider template (a plain spider inherits Spider)
    name = 'pic'  ### the spider name; run with: $ scrapy crawl pic
    allowed_domains = ['www.27270.com']  ## restrict the crawl to this domain
    start_urls = ['http://www.27270.com/tag/333.html']  ### initial URL
    #### a rule with follow=True keeps following the pages it matches
    #### the spider scans the start page for detail-page URLs and pagination links, follows each pagination page and repeats, and hands every detail page to the callback
    rules = (
        ### crawl the index pages and keep following links inside them
        ### matches every pagination page under start_urls
        Rule(LinkExtractor(allow=r'/tag/[0-9]*_[0-9]*.html'), follow=True),
        ### crawl the item pages and pass the downloaded response to parse_item
        #### matches the detail pages linked from each pagination page
        Rule(LinkExtractor(allow=r'http://www.27270.com/ent/[a-z]*/[0-9]*/[0-9]*.html'), callback='parse_item', follow=False),
        #Rule(LinkExtractor(allow=r'http://www.27270.com/zhuangxiusheji/[0-9]*/[0-9]*.html'), callback='parse_item', follow=False),
    )
    #### callback for detail pages
    def parse_item(self, response):
        start_url = response.url
        item = PicspiderItem()
        tag_name = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
        # cursor.execute(u'select id from network_type where PID=258 AND TYPENAME="{0}" limit 0,1'.format(tag_name))
        # old_id = cursor.fetchone()
        # if old_id:
        #     exit()
        # only keep galleries whose title contains u'人体'; everything else is skipped
        name = u'人体'
        if name not in tag_name:
            print u'---- this is some other category ----'
            return False
        li_list = response.xpath('//ul[@class="articleV4Page l"]/li').extract()
        srcs = []
        # the trailing <li> entries are navigation controls, hence the -3
        for v in range(1, (len(li_list) - 3)):
            if v == 1:
                url_s = start_url
            else:
                url_s = start_url.replace('.html', '') + '_' + str(v) + '.html'
            try:
                request = urllib2.Request(url_s, headers=headers)
                response = urllib2.urlopen(request, timeout=200).read()
            except urllib2.URLError, err:
                print err, 'bad url: ' + url_s
                continue
            obj = BeautifulSoup(response, 'html.parser')
            # three fallback selectors for the picture, tried in order
            try:
                pic_url = obj.find('center').find('img')['src']
            except:
                print u'---- first extraction attempt failed ----'
                try:
                    pic_url = obj.find('div', {'id': 'picBody'}).find('img')['src']
                except:
                    print u'---- second extraction attempt failed ----'
                    try:
                        pic_url = obj.find('p', attrs={"style": "text-align: center"}).find('img')['src']
                    except:
                        print u'---- third extraction attempt failed ----'
                        continue
            srcs.append(pic_url)
        item['tag'] = tag_name
        item['file_path'] = '%s%s' % (get_project_settings().get('IMAGES_STORE'), tag_name)
        item['image_urls'] = srcs
        return item
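One more design note on parse_item: the inner gallery pages are fetched synchronously with urllib2, which bypasses Scrapy's scheduler, DOWNLOAD_DELAY and request dedupe. A hedged alternative is to chain Scrapy requests and accumulate the image URLs via meta; a sketch of two methods that could replace the urllib2 loop on PicSpider (the chaining scheme is an assumption, not the original code):

# assumes the imports at the top of spider.py
def parse_item_chained(self, response):
    item = PicspiderItem()
    item['tag'] = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
    item['image_urls'] = []
    first = response.xpath('//div[@id="picBody"]//img/@src').extract_first()
    if first:
        item['image_urls'].append(first)
    # hand off to the per-page callback, starting with page 2
    next_url = response.url.replace('.html', '') + '_2.html'
    yield scrapy.Request(next_url, callback=self.parse_pic_page,
                         meta={'item': item, 'page': 2})

def parse_pic_page(self, response):
    item = response.meta['item']
    src = response.xpath('//div[@id="picBody"]//img/@src').extract_first()
    if src:
        item['image_urls'].append(src)
        page = response.meta['page'] + 1
        next_url = response.url.rsplit('_', 1)[0] + '_%d.html' % page
        yield scrapy.Request(next_url, callback=self.parse_pic_page,
                             meta={'item': item, 'page': page})
    else:
        # no picture found: treat as the end of the gallery and emit the item
        # (a real version would also use errback to catch a 404 on the last page)
        yield item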
------ I am still not very clear on Scrapy's deduplication; if any expert knows the details, please let me know, thanks.
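For reference, a hedged pointer on that question: by default Scrapy dedupes requests with scrapy.dupefilters.RFPDupeFilter, which fingerprints each request (method plus canonical URL plus body) and silently drops repeats within a run; setting JOBDIR persists the seen set across runs. A small demonstration:

import scrapy
from scrapy.utils.request import request_fingerprint

r1 = scrapy.Request('http://www.27270.com/tag/333.html')
r2 = scrapy.Request('http://www.27270.com/tag/333.html#comments')
# fragments are dropped during URL canonicalization, so these collide
print request_fingerprint(r1) == request_fingerprint(r2)  # True

# opt a single request out of dedupe:
r3 = scrapy.Request('http://www.27270.com/tag/333.html', dont_filter=True)
# persist the seen-fingerprint set across runs via settings.py:
# JOBDIR = 'crawl_state'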
来源: https://www.cnblogs.com/shuangzikun/p/python_taotao_scrapy_pic_mysql.html