- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import re
- import os
- import urllib, urllib2, cookielib
- import shutil
- from BeautifulSoup import BeautifulSoup
- # ---- utils ----
def normalize_url(url):
    """Return *url* with an ``http://`` scheme prepended when it has none.

    A URL that already starts with ``http://`` or ``https://`` (compared
    case-insensitively) is returned unchanged. Anything else, including the
    empty string, gets ``http://`` put in front of it.

    The earlier implementation only recognised a lowercase ``http://``
    prefix, so ``https://`` URLs were turned into ``http://https://...``,
    and it relied on ``cmp``, which no longer exists in Python 3.
    """
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url
def safeDir(dir):
    """Return *dir* turned into a name usable as a single directory.

    Path separators (``/``) are removed, as before, so the name cannot
    span several path components. NUL bytes are removed as well because
    the operating system rejects them in file names.

    If nothing usable is left -- the result is empty, ``.`` or ``..`` --
    the placeholder ``_`` is returned, so callers never try to create or
    enter the current or parent directory by accident.
    """
    # The parameter keeps its original name (it shadows the builtin) so
    # existing keyword callers continue to work.
    cleaned = dir.replace('/', '').replace('\x00', '')
    if cleaned in ('', '.', '..'):
        return '_'
    return cleaned
# Crawl forum index pages 1..24, and for every thread listed on a page
# download the thread's images into <homedir>/<page>/<thread title>/.
# Threads whose directory already exists are skipped.

# ---- variable ----
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
threadPrefix = "http://60dxw.comww1.baisex.me/"
homedir = "baixingge"

# Compiled once here instead of on every loop iteration.
urlPattern = re.compile(r'href="([^"]*)"')
titlePattern = re.compile(r'>([^<]*)</a>')
imgPattern = re.compile(r'img src="([^"]*)" onload')

# ---- login ----
# NOTE(review): this cookie-aware opener is built but the requests below go
# through urllib, so it is currently unused. Kept so the module-level names
# stay available.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)


def _fetch(url):
    """Return the response body for *url*, always closing the connection."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


# ---- file ----
if not os.path.exists(homedir):
    os.mkdir(homedir)
# Everything below uses paths relative to homedir. The working directory is
# changed exactly once, so an error part-way through can no longer leave the
# process inside some nested page/thread directory.
os.chdir(homedir)

# ---- crawl ----
for page in range(1, 25):
    pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)

    # ---- mkdir ----
    pageDir = str(page)
    if not os.path.exists(pageDir):
        os.mkdir(pageDir)
    print(pageUrl)

    # ---- download ----
    soup = BeautifulSoup(_fetch(pageUrl))

    # ---- extract ----
    urlRaws = soup.findAll('th', attrs={'class': ['new', 'common']})
    for urlRaw in urlRaws:
        rawHtml = str(urlRaw)
        h = urlPattern.search(rawHtml)
        t = titlePattern.search(rawHtml)
        if h is None or t is None:
            # The cell has no link or no link text; nothing to download.
            continue
        threadUrl = h.group(1)
        threadName = safeDir(t.group(1))

        # Test for the *sanitized* name: that is the directory actually
        # created. Checking the raw title missed existing directories for
        # titles containing '/', and os.mkdir then failed on a re-run.
        threadDir = os.path.join(pageDir, threadName)
        if os.path.exists(threadDir):
            continue
        os.mkdir(threadDir)

        page_url = threadPrefix + threadUrl
        print("---->{0}".format(page_url))
        print("---->{0}".format(threadName))

        page_soup = BeautifulSoup(_fetch(page_url))
        images = imgPattern.findall(str(page_soup))
        for index, img in enumerate(images):
            print("-------->{0}".format(img))
            # splitext only looks at the last path component, so a URL
            # without a file extension yields '' instead of raising or
            # picking up a dot from the host name.
            imgSuffix = os.path.splitext(img)[1]
            imgName = "{0}{1}".format(index, imgSuffix)
            urllib.urlretrieve(img, os.path.join(threadDir, imgName))
# This snippet comes from http://www.codesnippet.cn/detail/280520133601.html
# Source: http://www.codesnippet.cn/detail/280520133601.html