#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
"""
Author: MR RaoJL
Date: 'Sun Dec 25 12:28:08 2016'
Purpose: scrape image addresses from the www.aitaotu.com/guonei pages
Environment: Python 3.5 (Windows or Linux; mainly tested on Linux)
Known issue: scraping is too slow
(Author's note: written by a beginner, so there are probably many problems.)
"""
from itertools import chain  # flatten the 2-D list of image URLs
import requests              # HTTP requests
import re                    # regular expressions
import os                    # kept from original (directory/OS helpers)
import sys                   # platform detection / exit
import getpass               # current OS user name (Windows save path)

URL_IMG_RE = []  # image addresses after regex filtering (filled by SAVE_)
TITLE = []       # titles
PAGE = []        # page count of each image group (filled by MAIN)
d = []           # every page URL that will be searched for image addresses
# Base URL. NOTE(review): the original literal was truncated to `url = '`;
# restored from the site named in the module docstring.
url = 'http://www.aitaotu.com/'
ListURL = requests.get(url + 'guonei/').text  # HTML of the /guonei index page
# Number of index sub-pages under /guonei, parsed from a "list_NNN" pagination link.
ListPage = int(re.search('list.[0-9]{3}', ListURL).group().replace('list_', ''))
IMG_URL = []  # absolute URLs of the image groups on the current index page
RE_PLA_1 = "[u'<span class=\"totalpage\">"  # boilerplate stripped from str(findall(...))
RE_PLA_2 = '</span>\']'
img_AD = []
IMG = []      # list of lists: image addresses found on each page
MD = '/'
MH = ':'
HD = '://'
def MAIN():
    """Main scraping pass for one batch of image groups.

    Reads the globals set elsewhere in the script (``imgurl``, ``url``,
    ``RE_PLA_1``/``RE_PLA_2``) and appends to the module-level
    accumulators ``IMG_URL``, ``PAGE``, ``d`` and ``IMG``.
    """
    # Absolute URL of every image group found on the current index page.
    for rel in imgurl:
        IMG_URL.append(url + rel)
    # Page count of each group, scraped from its <span class="totalpage"> marker.
    for group_url in IMG_URL:
        found = str(re.findall('.span.class=\"totalpage\".\\d+./span.',
                               requests.get(group_url).text))
        PAGE.append(found.replace(RE_PLA_1, '').replace(RE_PLA_2, ''))
    print(TITLE)
    # Map each group URL to ITS OWN page count.  The original iterated
    # `for e in ALL.values()` inside the per-URL loop with one shared `j`
    # counter, pairing every URL with every page count — a bug.
    ALL = dict(zip(IMG_URL, PAGE))
    for URL, pages in ALL.items():
        d.append(URL)  # page 1 of a group has no _<n> suffix
        try:
            last = int(pages)
        except ValueError:
            continue  # page count could not be parsed for this group
        # Pages 2..last live at ..._<n>.html; keep only the ones that exist.
        for j in range(2, last + 1):
            page_url = URL.replace('.html', '_' + str(j) + '.html')
            # One request per page (the original fetched each page twice:
            # once for the status code and once to close it).
            resp = requests.get(page_url)
            if resp.status_code == 200:
                d.append(page_url)
                print('GET :' + page_url)
            resp.close()
    # Finally pull the actual image addresses out of every collected page.
    for page_url in set(d):
        IMG.append(re.findall(
            '\\w+://img\\.\\w+\\.\\w+.?[0-9]+.\\w+.[0-9]+.\\d+.\\d+.\\d+\\.jpg',
            requests.get(page_url).text))
def SAVE_():
    """Write every collected image URL (from ``IMG``) to a text file.

    Linux: ``/tmp/img_url``; Windows: ``img_url.txt`` on the user's
    desktop.  Exits the script on unsupported platforms.  Each URL is
    also mirrored into the ``URL_IMG_RE`` accumulator.
    """
    # Bug fix: on Python 3 sys.platform is 'linux' ('linux2' was Python 2)
    # and 'win32' on Windows ('windows' is never a value), so the original
    # comparisons could never match and the function always exited.
    if sys.platform.startswith('linux'):
        path = '/tmp/img_url'
    elif sys.platform in ('win32', 'windows'):
        path = 'c:/Users/' + getpass.getuser() + '/Desktop/img_url.txt'
    else:
        print('不能创建文件{0}'.format('img_url'))
        sys.exit()
    # `with` guarantees the file is closed even if a write raises.
    with open(path, 'w') as out:
        for ITEM in chain.from_iterable(IMG):
            URL_IMG_RE.append(ITEM)
            print('写入图片URL:', str(ITEM), '到文件:/tmp/img_url')
            out.writelines(ITEM + '\n')
    # The original `if True: ... else: ...` success check was dead code;
    # reaching this point means every write succeeded.
    print('图片URL写入成功!')
def loop():
    """Walk every index page under /guonei and hand each batch to MAIN().

    Publishes the image-group links of the current index page in the
    global ``imgurl`` (consumed by MAIN) after stripping the navigation
    links that appear on every index page.
    """
    global imgurl
    # Links present in the sidebar of EVERY index page — not real groups.
    BOILERPLATE = (
        'guonei/11767.html', 'guonei/10787.html', 'guonei/6125.html',
        'guonei/7973.html', 'guonei/14707.html', 'guonei/6379.html',
        'guonei/7092.html', 'guonei/9543.html', 'guonei/10543.html',
        'guonei/6259.html',
    )
    print("开始爬取图片共{0}个父页面".format(ListPage))
    for taoal in range(1, ListPage + 1):
        # Page 1 is the bare index; later pages are list_<n>.html.
        if taoal == 1:
            F_get = requests.get(url + 'guonei').text
        else:
            F_get = requests.get('{0}guonei/list_{1}.html'.format(url, str(taoal))).text
        # (The original ran this findall twice on identical patterns;
        # the first result was discarded.)
        imgurl = set(re.findall('guonei/[0-9]{2,6}.html', F_get))
        for junk in BOILERPLATE:
            # discard, not remove: a link missing from this particular
            # page must not raise KeyError and kill the crawl.
            imgurl.discard(junk)
        MAIN()
# Script entry point: crawl every index page, then persist the image URLs.
loop()
SAVE_()
# The original wrapped this in `if True: ... else: print('爬取失败!')`;
# the else branch was unreachable dead code and has been removed.
print('爬取成功!总共 ' + str(len(URL_IMG_RE)) + " 张.")
# Source: (attribution line was left blank in the original paste)