- #! /usr/bin/env python
- # -*- coding=utf-8 -*-
- import urllib2,urllib
- import re,time,socket
- import os
- import sys
- import threading
# --- Working directory and the three scratch list files -----------------
path = os.getcwd()
new_path = os.path.join(path, r'mnsfz')  # all downloads land here
if not os.path.isdir(new_path):
    os.mkdir(new_path)

# List1: top-list page URLs; List2: album page URLs; List3: picture URLs.
path1 = os.path.join(new_path, 'List1.txt')
path2 = os.path.join(new_path, 'List2.txt')
path3 = os.path.join(new_path, 'List3.txt')
for _scratch in (path1, path2, path3):
    # 'wt' truncates any leftovers from a previous run.
    open(_scratch, 'wt').close()

# --- Scrape patterns ----------------------------------------------------
# 'next page' link on a top-list page
match1 = r'<div class="bgyellow_bsb"><a href="(info_toplist.*?)"'
# page URLs stored in List1.txt ('#' is the record separator)
match2 = r'(http.*?)#'
# album page URLs inside a top-list page
match3 = r'<a href="(unit_info.*?ps=18)">'
# album addresses stored in List2.txt
match4 = r'(unit_info.*?ps=18)'
# picture download address inside an open album, e.g.
# <input name="picurl" type="hidden" value="http://d4.lexun.net/.../94798621.jpg" emptyok="true" />
# Fixed: these patterns used r'\\.jpg' - a literal backslash in the regex -
# which can never match a real URL; r'\.jpg' is the intended escaped dot.
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'
# album 'next page' link ("下一页" = "next page"). NOTE(review): the original
# literal was garbled ('"<a href="...">u"下一页"') and could never match;
# reconstructed from match3 - confirm against the live markup.
match6 = r'<a href="(unit_info.*?ps=18)">下一页'
# picture addresses stored in List3.txt
match7 = r'(http.*?\.jpg)'
# full-size image address on the download page
match8 = r'<img src="(http.*?\.jpg)" alt='
# real image URL on the original-image download page
match9 = r'(http.*?\.jpg)'

# Entry point: first page of the top list.
url1 = r'http://p.lexun.net/w/info_toplist.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'
def pageloop1(url1):
    """Walk up to 40 top-list pages, appending each page URL (terminated
    by the '#' record separator) to List1.txt, following each page's
    'next page' link.

    Fixed: the original assigned bturl[0] to 'src1' but then read the
    undefined name 'src', so every call raised NameError.
    """
    for _ in range(1, 41):
        putintotxt(url1 + r'#', path1)
        html = useragent(url1)
        bturl = geturl(match1, html)
        if not bturl:
            # No further 'next page' link - stop instead of reusing a
            # stale/undefined URL.
            break
        url1 = r'http://p.lexun.net/w/' + bturl[0].replace(r'amp;', '')
- def pageloop2(url2):
- print r'page',url2
- html2=useragent(url2)
- pagelist=geturl(match3,html2)
- putintotxt(pagelist,path2)
def pageloop3(pageurl):
    """Scrape one album page given its site-relative URL: append all
    picture URLs to List3.txt, then follow the album's 'next page' link
    recursively until there is none.

    Fixed: the original assigned nextimgurl[0] to 'src1' but read the
    undefined name 'src' (NameError), and it prefixed the next-page link
    with the host here AND again on re-entry (double prefix). The
    relative link is now passed straight to the recursive call, which
    performs the single prefix itself.
    """
    url2 = r'http://p.lexun.net/w/' + pageurl.replace(r'amp;', '')
    html3 = useragent(url2)
    imglist = geturl(match5, html3)
    putintotxt(imglist, path3)
    nextimgurl = geturl(match6, html3)
    if nextimgurl:
        pageloop3(nextimgurl[0])
def pageloop4(urlimg):
    # Download one picture URL into new_path, skipping files that already
    # exist; any failure falls back to the resize-service route (useragent2).
    try:
        name=os.path.basename(urlimg)
        # NOTE(review): 'size' is really an existence flag (os.path.isfile
        # returns a bool), not a file size.
        size=os.path.isfile(new_path+'/'+name)
        if size==True:
            print u'已经存在'  # "already exists" - skip the download
            pass
        else:
            content=urllib2.urlopen(urlimg,None,timeout=20).read()
            with open(new_path+'/'+name,'wb') as code:
                code.write(content)
        # NOTE(review): reconstructed from an indentation-mangled paste; this
        # branch appears to sit at try level, meaning every NEWLY downloaded
        # file is also pushed through useragent2 ("需要host" = "needs host") -
        # presumably to obtain the full-size original. Confirm intent.
        if size==False:
            print u'需要host'
            useragent2(urlimg)
        else:
            print urlimg
    except:
        # Bare except: any error (timeout, bad URL, disk) retries via the
        # proxy/resize fallback instead of killing the worker thread.
        useragent2(urlimg)
- def useragent2(urlimg):
- try:
- url=r'http://app.lexun.com/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580'+str(i)
- values={
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) ApplewebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36",
- "picurl":urlimg
- }
- data = urllib.urlencode(values)
- req = urllib2.Request(url, data)
- proxy_support = urllib2.ProxyHandler({'http':'http://190.79.62.76:8080'})
- opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
- urllib2.install_opener(opener)
- response = urllib2.urlopen(req)
- html = response.read()
- HTL=geturl(match8,html)
- print HTL[0]
- pageloop4(HTL[0])
- except:
- pass
class getallpag(threading.Thread):
    """Worker thread: runs pageloop2 over ALLPAG[begin:end]."""
    def __init__(self, begin, end):
        super(getallpag, self).__init__()
        self.begin = begin  # first index (inclusive)
        self.end = end      # last index (exclusive)
    def run(self):
        for idx in range(self.begin, self.end):
            pageloop2(ALLPAG[idx])
class getimgpag(threading.Thread):
    """Worker thread: runs pageloop3 over ALLPAG2[begin:end]."""
    def __init__(self, begin, end):
        super(getimgpag, self).__init__()
        self.begin = begin  # first index (inclusive)
        self.end = end      # last index (exclusive)
    def run(self):
        for idx in range(self.begin, self.end):
            pageloop3(ALLPAG2[idx])
class getmypic(threading.Thread):
    """Worker thread: runs pageloop4 (the downloader) over ALLPIC[begin:end]."""
    def __init__(self, begin, end):
        super(getmypic, self).__init__()
        self.begin = begin  # first index (inclusive)
        self.end = end      # last index (exclusive)
    def run(self):
        for idx in range(self.begin, self.end):
            pageloop4(ALLPIC[idx])
def geturl(match, html):
    """Return every substring of html captured by the pattern `match`."""
    # re.findall accepts the pattern string directly and caches its
    # compilation internally - same result as an explicit re.compile.
    return re.findall(match, html)
def putintotxt(url, path):
    """Append url (a string or a sequence of strings) to the file at path,
    with no separators added."""
    out = open(path, 'a+')
    try:
        out.writelines(url)
    finally:
        out.close()
def useragent(url):
    """Fetch url and return the response body; on any error return the
    dummy string '123456', which matches none of the scrape patterns
    and therefore yields zero results downstream.

    Fixed: narrowed the bare except to Exception so that interrupts
    (KeyboardInterrupt/SystemExit) are no longer swallowed.
    """
    try:
        html = urllib2.urlopen(url, None, timeout=10).read()
    except Exception:
        # Best-effort: a single failed page fetch must not kill the crawl.
        html = r'123456'
    return html
def listmk(path, match):
    """Read the work file at `path` and return all matches of `match`
    in its contents.

    Fixed: the original wrote 'f.close' without calling it (leaked the
    handle) and scanned only readlines()[0] - which raised IndexError
    on an empty file. The lists are written without newlines, so the
    first line WAS the whole file; scanning read() is equivalent and
    safe on empty input.
    """
    with open(path, 'r') as f:
        content = f.read()
    return re.findall(match, content)
- pageloop1(url1)
- ALLPAG=listmk(path1,match2)
- l=len(ALLPAG)
- print l
- if __name__ == '__main__':
- threads = []
- m=1
- n=10
- while(1):
- threads.append(getallpag(m-1,n-1))
- m+=10
- n+=10
- if n-1>l:
- break
- for t in threads:
- t.start()
- for t in threads:
- t.join()
- ALLPAG2=listmk(path2,match4)
- l2=len(ALLPAG2)
- print l2
- if __name__ == '__main__':
- threads = []
- m=0
- n=100
- while(1):
- threads.append(getimgpag(m,n))
- m+=101
- n+=100
- if n>l2:
- break
- for t in threads:
- t.start()
- for t in threads:
- t.join()
- ALLPIC=listmk(path3,match7)
- print u'一共:',len(ALLPIC)
- if __name__ == '__main__':
- threads = []
- i=0
- j=100
- kl=len(ALLPIC)
- while(1):
- threads.append(getmypic(i,j))
- i+=101
- j+=100
- if j>kl:
- break
- for t in threads:
- t.start()
- # 等待子线程结束
- for t in threads:
- t.join()
- print "the end!!"
# Snippet source: http://www.codesnippet.cn/detail/3003201512024.html