#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Filename: download_file.py
import os
import re
import sys
import urllib
import urllib2
# Address of the listing page handed to get_array_url(). 'xxx' is a
# placeholder -- presumably the real site address was removed before the
# snippet was published; TODO confirm.
base_url = 'xxx'

# Shared result lists. The functions below fill them in place.
array_url = []  # page URLs appended by get_array_url()
pic_url = []    # image URLs appended by get_pic_url()
inner_url = []  # sub-page URLs appended by get_inner_url()
def get_array_url(array_url, base_url):
    """Collect page links from the listing page at ``base_url``.

    Every path in the downloaded page that starts with ``/rihan`` and ends
    in ``.html`` is prefixed with ``'xxx'`` and appended to ``array_url``.

    Args:
        array_url: list that is extended in place with the links found.
        base_url: address of the listing page to download.

    Returns:
        None. Results are delivered through ``array_url``.
    """
    response = urllib.urlopen(base_url)
    try:
        content = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    # The dot before "html" is escaped so that only real ".html" paths match
    # (an unescaped dot would accept any character in that position).
    # NOTE(review): the 'xxx' prefix looks like a redacted site root -- confirm.
    array_url.extend(
        'xxx' + path for path in re.findall(r'/rihan.*?\.html', content)
    )
def get_inner_url(array_url, inner_url):
    """Collect the sub-page URLs linked from the page at ``array_url``.

    The page is downloaded, every space is removed from its markup, and each
    ``<li>...</li>`` element is searched for a link of the form
    ``href='<something>.html' target``. Only the file name of the link is
    kept and it is appended to ``'http://xxx/'``.

    ``inner_url`` receives ``array_url`` itself, followed by every link
    found except the first one.

    Args:
        array_url: URL (a string) of the page to scan.
        inner_url: list that is extended in place.

    Returns:
        None. Results are delivered through ``inner_url``; the list is left
        untouched when the download fails.
    """
    # Fetch the page itself. Indexing the URL with [10] would pick a single
    # character of the string instead of a page address.
    response = urllib.urlopen(array_url)
    try:
        content = response.read()
    finally:
        response.close()

    # Dropping all spaces turns "<a href='..' target" into
    # "<ahref='..'target", which is the form the pattern below expects.
    content = content.replace(" ", "")

    # [^']* keeps the match inside a single quoted attribute value.
    link_pattern = re.compile(r"ahref='([^']*\.html)'target")
    links = []
    for item in re.findall(r'<li>.*?</li>', content):
        match = link_pattern.search(item)
        if match:
            # Remove everything from the first slash through the last slash,
            # leaving just the file name.
            file_name = re.sub(r'/.*/', '', match.group(1))
            links.append('http://xxx/' + file_name)

    inner_url.append(array_url)
    # The first link is skipped -- presumably it points back at the page that
    # was just added; TODO confirm. Slicing (rather than deleting index 1)
    # also copes with pages that contain no link and with a non-empty
    # ``inner_url``.
    inner_url.extend(links[1:])
def get_pic_url(pic_url, inner_url, array_url):
    """Derive the image URLs for the page at ``array_url``.

    The first ``http://...jpg`` address that follows the text ``center`` in
    the downloaded page is taken as image number 1. One URL per entry of
    ``inner_url`` is then produced by replacing ``/1.jpg`` with ``/2.jpg``,
    ``/3.jpg`` and so on, and the results are appended to ``pic_url``.
    Finally every entry of ``pic_url`` is printed.

    Args:
        pic_url: list that is extended in place with the image URLs.
        inner_url: only its length is used; it decides how many image URLs
            are generated.
        array_url: URL (a string) of the page to download.

    Returns:
        None. ``pic_url`` is left unchanged when no image is found.
    """
    response = urllib.urlopen(array_url)
    try:
        content = response.read()
    finally:
        response.close()

    candidates = re.findall(r'center.*?\.jpg', content)
    # Debug output kept from the original implementation.
    print('bbbbbbbbb %d' % len(candidates))

    first_hits = re.findall(r'http://.*\.jpg', candidates[0]) if candidates else []
    if not first_hits:
        # Without a first image there is nothing to derive the others from.
        print('no image URL found on %s' % array_url)
        return
    first_pic = first_hits[0]

    # Images are numbered 1..N with N == len(inner_url). For number 1 the
    # replacement is a no-op, so the first entry is ``first_pic`` itself.
    pic_url.extend(
        first_pic.replace('/1.jpg', '/%d.jpg' % number)
        for number in range(1, len(inner_url) + 1)
    )

    for pic in pic_url:
        print(pic)
def urlcallback(a, b, c):
    """Progress callback that prints the download percentage.

    Args:
        a: number of data blocks downloaded so far.
        b: size of one data block.
        c: size of the remote file.

    Returns:
        None.
    """
    print("callback")
    if c <= 0:
        # No usable total size (0, or a negative value for "unknown"):
        # a percentage cannot be computed, so report the amount received.
        print("downloaded %d bytes (total size unknown)" % (a * b))
        return
    # The last block can overshoot the file size, so cap at 100.
    prec = min(100.0 * a * b / c, 100)
    print("%.2f%%" % (prec,))
def download(img_url, file_num, base_dir='C:/'):
    """Download every image in ``img_url`` into ``<base_dir>/<file_num>/``.

    Each URL is printed and then fetched with ``urllib.urlretrieve``, using
    ``urlcallback`` to report progress. The local file name is the part of
    the URL after its last slash.

    Args:
        img_url: iterable of image URLs.
        file_num: name of the sub-directory (converted with ``str``) that
            receives the files.
        base_dir: parent directory of the numbered folders. Defaults to
            ``'C:/'``, the location that used to be hard-coded.

    Returns:
        None.
    """
    target_dir = os.path.join(base_dir, str(file_num))
    for img in img_url:
        print(img)
        # urlretrieve does not create missing folders; make sure the target
        # exists before a file is written. Done lazily so that an empty
        # ``img_url`` leaves the file system untouched.
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        # Taking the text after the last slash works for any URL scheme.
        img_name = img.rsplit('/', 1)[-1]
        urllib.urlretrieve(img, os.path.join(target_dir, img_name), urlcallback)
# Collect the page URLs from the listing page.
get_array_url(array_url, base_url)
file_num = 3
# download(pic_url, file_num)

# Trial run on a single page. The name ``url`` was previously used here
# without ever being assigned at module level, which raises NameError.
# NOTE(review): index 10 is a guess at the page the author was testing
# with -- confirm which page is intended.
sample_index = 10
if len(array_url) > sample_index:
    url = array_url[sample_index]
    get_inner_url(url, inner_url)
    get_pic_url(pic_url, inner_url, url)
else:
    print('only %d page URLs found; skipping the trial run' % len(array_url))

# Full batch run, currently disabled: process every page, download its
# images into a folder of its own, then reset the shared lists.
#
# for url in array_url:
#     print(url)
#     get_inner_url(url, inner_url)
#     get_pic_url(pic_url, inner_url, url)
#     download(pic_url, file_num)
#     file_num = file_num + 1
#     del inner_url[:]
#     del pic_url[:]
# This snippet comes from http://www.codesnippet.cn/detail/121120137098.html
# Source: http://www.codesnippet.cn/detail/121120137098.html