import os
import random
import threading
import winreg
from queue import Queue

import requests
from bs4 import BeautifulSoup


# Base URL of a volume page; the volume number is appended to it.
luoo_site = 'http://www.luoo.net/music/'

# Track URL template, filled with (volume number, track order).
luoo_site_mp3 = 'http://mp3-cdn.luoo.net/low/luoo/radio%s/%s.mp3'

# Pool of browser User-Agent strings to choose from when downloading.
# NOTE(review): the first entry's casing ('ApplewebKit', 'Khtml') differs from
# the other entries; kept exactly as found -- confirm whether it is intended.
user_agents = (
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) ApplewebKit/532.5 (Khtml, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
)

# Base HTTP headers for track download requests. No 'User-Agent' is set here;
# the script entry point adds one at runtime.
headers = {
    'Host': 'mp3-cdn.luoo.net',
    'Accept': 'audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'http://www.luoo.net/music/896',
    'Range': 'bytes=0-',
    'Connection': 'keep-alive',
}


def get_desktop():
    r"""Return the current user's Desktop folder path from the Windows registry.

    Reads the ``Desktop`` value of the key
    ``HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders``.
    Windows-only, because it relies on ``winreg``.

    Returns:
        The data stored in the ``Desktop`` registry value.

    Raises:
        OSError: if the registry key cannot be opened or the value is missing.
    """
    # The ``with`` block closes the registry handle; the bare OpenKey() call
    # used before never released it.
    with winreg.OpenKey(
        winreg.HKEY_CURRENT_USER,
        r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders',
    ) as key:
        return winreg.QueryValueEx(key, "Desktop")[0]

# Deletion table for str.translate: every listed character maps to "removed".
# Presumably these are the characters Windows rejects in file names.
_STRIPPED_CHARACTERS = str.maketrans('', '', '<>:"/\\|?*')


def fix_characters(s):
    r"""Return *s* with every ``< > : " / \ | ? *`` character removed.

    Args:
        s: the string to clean.

    Returns:
        A copy of *s* without any of the nine characters above; all other
        characters are kept in their original order.
    """
    # One pass over the string instead of nine chained replace() calls.
    return s.translate(_STRIPPED_CHARACTERS)

def fix_order(order):
    """Return the integer *order* as a string zero-padded to two digits.

    Args:
        order: track position as an integer.

    Returns:
        ``'01'`` .. ``'09'`` for values below ten, otherwise the plain decimal
        string (``'10'``, ``'123'``, ...).
    """
    # %02d pads only when needed, and avoids the old local variable that
    # shadowed this function's own name.
    return '%02d' % order


class LuooSpider(threading.Thread):
    """Thread that scrapes volume pages and queues one metadata dict per volume.

    For every volume number in ``vols`` the page at ``url + str(vol)`` is
    fetched and parsed, and the resulting dict is put on ``queue``.
    """

    # Seconds to wait for a volume page before giving up on that volume.
    request_timeout = 30

    def __init__(self, url, vols, queue=None):
        """Store the crawl parameters and print a start banner.

        Args:
            url: base URL of the volume pages; the volume number is appended.
            vols: iterable of volume numbers to crawl.
            queue: queue that receives one dict per successfully parsed volume.
        """
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.queue = queue
        self.url = url
        self.vols = vols

    def run(self):
        """Crawl every volume in ``self.vols`` in order, then report the end."""
        for vol in self.vols:
            self.spider(vol)
        print('crawl end')

    def spider(self, vol):
        """Fetch one volume page and queue its metadata.

        A volume whose page cannot be fetched, or whose page lacks any of the
        expected elements, is reported and skipped so the thread keeps going.

        Args:
            vol: volume number to crawl.
        """
        # Use the URL this spider was constructed with, not the module global.
        url = self.url + str(vol)
        print('?crawling: ' + url)
        try:
            res = requests.get(url, timeout=self.request_timeout)
        except requests.RequestException as err:
            print('request failed: %s (%s)' % (url, err))
            return
        soup = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
        try:
            volume = self._parse_volume(soup, vol)
        except (AttributeError, KeyError, TypeError):
            # find() returned None, or a tag lacks the expected attribute.
            print('Looks like nothing to do here?!')
            return
        self.queue.put(volume)

    @staticmethod
    def _parse_volume(soup, vol):
        """Build the metadata dict for volume *vol* from its parsed page."""
        title = soup.find('span', attrs={'class': 'vol-title'}).text
        cover = soup.find('img', attrs={'class': 'vol-cover'})['src']
        desc = soup.find('div', attrs={'class': 'vol-desc'}).text
        author = soup.find('a', attrs={'class': 'vol-author'}).text
        date = soup.find('span', attrs={'class': 'vol-date'}).text
        track_infos = soup.find_all('li', attrs={'class': 'track-item rounded'})
        tracks = [
            LuooSpider._parse_track(track, order)
            for order, track in enumerate(track_infos, 1)
        ]
        return {
            'vol_num': vol,
            'vol_title': title,
            'vol_cover': cover,
            'vol_desc': desc,
            'vol_author': author,
            'vol_date': date,
            'track_count': len(track_infos),
            'tracks': tracks,
        }

    @staticmethod
    def _parse_track(track, order):
        """Build the dict for one track element at 1-based position *order*."""
        return {
            '_order': fix_order(order),
            # Drops the first 5 characters of the element id
            # (presumably a fixed prefix before the numeric id -- confirm).
            '_id': track['id'][5:],
            '_cover': track.find('img', attrs={'class': 'cover rounded'})['src'],
            '_name': track.find('p', attrs={'class': 'name'}).text,
            # Drops the first 8 / 7 characters of the text
            # (presumably a label such as "Artist: " / "Album: " -- confirm).
            '_artist': track.find('p', attrs={'class': 'artist'}).text[8:],
            '_album': track.find('p', attrs={'class': 'album'}).text[7:],
        }


class LuooDownloader(threading.Thread):
    """Thread that takes volume dicts off a queue and downloads their tracks.

    Each track is saved as ``<dist>/<vol_num>/<order>.<name>.mp3``; files that
    already exist are skipped.
    """

    # Response bodies shorter than this many bytes are treated as a failed
    # download rather than audio data.
    min_track_bytes = 280

    # Seconds to wait for a track request before giving up on that track.
    request_timeout = 60

    def __init__(self, url, dist, queue=None):
        """Store the download parameters.

        Args:
            url: track URL template with two ``%s`` slots
                (volume number, track order).
            dist: destination root directory.
            queue: queue of volume dicts to process; a ``None`` item tells
                the thread to stop.
        """
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0  # not read anywhere in this class
        # Snapshot the module-level headers now, so a later change to that
        # shared dict (e.g. another downloader's User-Agent) cannot alter
        # the headers this downloader sends.
        self.headers = dict(headers)

    def run(self):
        """Process queued volumes until a ``None`` sentinel is received."""
        while True:
            # get() blocks until an item arrives; this replaces polling
            # qsize() in a tight loop, which kept a CPU core busy while idle.
            phases = self.queue.get()
            if phases is None:
                break
            self.download(phases)

    def download(self, phases):
        """Download every track of one volume that is not on disk yet.

        Args:
            phases: volume dict with at least ``'vol_num'`` and ``'tracks'``;
                each track needs ``'_order'`` and ``'_name'``.
        """
        tracks = phases['tracks']
        if not tracks:
            return
        vol_num = phases['vol_num']
        local_file_dict = '%s/%s' % (self.dist, vol_num)
        # Created once per volume; exist_ok avoids a race between downloaders.
        os.makedirs(local_file_dict, exist_ok=True)
        for track in tracks:
            name = track['_name']
            order = track['_order']
            # Strip characters such as '/' or '?' so the track name cannot
            # break the path.
            local_file = '%s/%s.%s.mp3' % (
                local_file_dict, order, fix_characters(name))
            print('?processing: ' + name)
            if os.path.isfile(local_file):
                print('?skipped: ' + name)
                continue
            print('?downloading: ' + name)
            try:
                res = self._fetch(vol_num, order)
            except requests.RequestException as err:
                print('download failed: %s (%s)' % (name, err))
                continue
            if len(res) < self.min_track_bytes:
                # Do not save an error body as .mp3: the file would then be
                # skipped as "already downloaded" on every later run.
                print('download failed: %s (response too small)' % name)
                continue
            with open(local_file, 'wb') as f:
                f.write(res)
            print('?completed: ' + name)

    def _fetch(self, vol_num, order):
        """Return the track bytes, trying the padded then the unpadded order."""
        res = b''
        for candidate in (order, str(int(order))):
            file_url = self.url % (vol_num, candidate)
            res = requests.get(
                file_url, headers=self.headers, timeout=self.request_timeout,
            ).content
            if len(res) >= self.min_track_bytes:
                break
        return res


def main():
    """Start one spider thread and ten downloader threads sharing a queue."""
    vol_queue = Queue()
    luoo = LuooSpider(luoo_site, vols=range(1, 1000), queue=vol_queue)
    luoo.start()

    # Resolve the destination once instead of once per downloader.
    dist = get_desktop() + '/luoo'
    downloader_count = 10
    for _ in range(downloader_count):
        # Updates the shared module-level headers dict used for downloads.
        headers['User-Agent'] = random.choice(user_agents)
        luoo_download = LuooDownloader(luoo_site_mp3, dist, queue=vol_queue)
        luoo_download.start()


if __name__ == '__main__':
    main()
# Source: http://www.bubuko.com/infodetail-1982246.html