#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
"""
Author: MR RaoJL
Date: 'Sun Dec 25 12:28:08 2016'
Purpose: scrape image addresses from the www.aitaotu.com/guonei pages
Environment: Python 3.5 (Windows or Linux; mainly tested on Linux)
Known issue: scraping is too slow
(Author's note: written by a beginner, so there are probably many problems.)
"""
from itertools import chain  # flatten the 2-D list of image URLs
import requests              # HTTP requests
import re                    # regular expressions
import os                    # kept from original (directory/OS helpers)
import sys                   # platform detection / exit
import getpass               # current OS user name (Windows save path)

URL_IMG_RE = []  # image addresses after regex filtering (filled by SAVE_)
TITLE = []       # titles
PAGE = []        # page count of each image group (filled by MAIN)
d = []           # every page URL that will be searched for image addresses
# Base URL. NOTE(review): the original literal was truncated to `url = '`;
# restored from the site named in the module docstring.
url = 'http://www.aitaotu.com/'
ListURL = requests.get(url + 'guonei/').text  # HTML of the /guonei index page
# Number of index sub-pages under /guonei, parsed from a "list_NNN" pagination link.
ListPage = int(re.search('list.[0-9]{3}', ListURL).group().replace('list_', ''))
IMG_URL = []  # absolute URLs of the image groups on the current index page
RE_PLA_1 = "[u'<span class=\"totalpage\">"  # boilerplate stripped from str(findall(...))
RE_PLA_2 = '</span>\']'
img_AD = []
IMG = []      # list of lists: image addresses found on each page
MD = '/'
MH = ':'
HD = '://'
def MAIN():
    """Main scraping pass for one batch of image groups.

    Reads the globals set elsewhere in the script (``imgurl``, ``url``,
    ``RE_PLA_1``/``RE_PLA_2``) and appends to the module-level
    accumulators ``IMG_URL``, ``PAGE``, ``d`` and ``IMG``.
    """
    # Absolute URL of every image group found on the current index page.
    for rel in imgurl:
        IMG_URL.append(url + rel)
    # Page count of each group, scraped from its <span class="totalpage"> marker.
    for group_url in IMG_URL:
        found = str(re.findall('.span.class=\"totalpage\".\\d+./span.',
                               requests.get(group_url).text))
        PAGE.append(found.replace(RE_PLA_1, '').replace(RE_PLA_2, ''))
    print(TITLE)
    # Map each group URL to ITS OWN page count.  The original iterated
    # `for e in ALL.values()` inside the per-URL loop with one shared `j`
    # counter, pairing every URL with every page count — a bug.
    ALL = dict(zip(IMG_URL, PAGE))
    for URL, pages in ALL.items():
        d.append(URL)  # page 1 of a group has no _<n> suffix
        try:
            last = int(pages)
        except ValueError:
            continue  # page count could not be parsed for this group
        # Pages 2..last live at ..._<n>.html; keep only the ones that exist.
        for j in range(2, last + 1):
            page_url = URL.replace('.html', '_' + str(j) + '.html')
            # One request per page (the original fetched each page twice:
            # once for the status code and once to close it).
            resp = requests.get(page_url)
            if resp.status_code == 200:
                d.append(page_url)
                print('GET :' + page_url)
            resp.close()
    # Finally pull the actual image addresses out of every collected page.
    for page_url in set(d):
        IMG.append(re.findall(
            '\\w+://img\\.\\w+\\.\\w+.?[0-9]+.\\w+.[0-9]+.\\d+.\\d+.\\d+\\.jpg',
            requests.get(page_url).text))
def SAVE_():
    """Write every collected image URL (from ``IMG``) to a text file.

    Linux: ``/tmp/img_url``; Windows: ``img_url.txt`` on the user's
    desktop.  Exits the script on unsupported platforms.  Each URL is
    also mirrored into the ``URL_IMG_RE`` accumulator.
    """
    # Bug fix: on Python 3 sys.platform is 'linux' ('linux2' was Python 2)
    # and 'win32' on Windows ('windows' is never a value), so the original
    # comparisons could never match and the function always exited.
    if sys.platform.startswith('linux'):
        path = '/tmp/img_url'
    elif sys.platform in ('win32', 'windows'):
        path = 'c:/Users/' + getpass.getuser() + '/Desktop/img_url.txt'
    else:
        print('不能创建文件{0}'.format('img_url'))
        sys.exit()
    # `with` guarantees the file is closed even if a write raises.
    with open(path, 'w') as out:
        for ITEM in chain.from_iterable(IMG):
            URL_IMG_RE.append(ITEM)
            print('写入图片URL:', str(ITEM), '到文件:/tmp/img_url')
            out.writelines(ITEM + '\n')
    # The original `if True: ... else: ...` success check was dead code;
    # reaching this point means every write succeeded.
    print('图片URL写入成功!')
def loop():
    """Walk every index page under /guonei and hand each batch to MAIN().

    Publishes the image-group links of the current index page in the
    global ``imgurl`` (consumed by MAIN) after stripping the navigation
    links that appear on every index page.
    """
    global imgurl
    # Links present in the sidebar of EVERY index page — not real groups.
    BOILERPLATE = (
        'guonei/11767.html', 'guonei/10787.html', 'guonei/6125.html',
        'guonei/7973.html', 'guonei/14707.html', 'guonei/6379.html',
        'guonei/7092.html', 'guonei/9543.html', 'guonei/10543.html',
        'guonei/6259.html',
    )
    print("开始爬取图片共{0}个父页面".format(ListPage))
    for taoal in range(1, ListPage + 1):
        # Page 1 is the bare index; later pages are list_<n>.html.
        if taoal == 1:
            F_get = requests.get(url + 'guonei').text
        else:
            F_get = requests.get('{0}guonei/list_{1}.html'.format(url, str(taoal))).text
        # (The original ran this findall twice on identical patterns;
        # the first result was discarded.)
        imgurl = set(re.findall('guonei/[0-9]{2,6}.html', F_get))
        for junk in BOILERPLATE:
            # discard, not remove: a link missing from this particular
            # page must not raise KeyError and kill the crawl.
            imgurl.discard(junk)
        MAIN()
# Script entry point: crawl every index page, then persist the image URLs.
loop()
SAVE_()
# The original wrapped this in `if True: ... else: print('爬取失败!')`;
# the else branch was unreachable dead code and has been removed.
print('爬取成功!总共 ' + str(len(URL_IMG_RE)) + " 张.")
# Source: (attribution line was left blank in the original paste)