老司机带你学python 用Python开车的老司机

废话不多说，驾照在此：

# -*- coding: utf-8 -*-
# Image crawler: fetch a page, download every image found on it into a
# directory named after the page title, then follow a link to the next page.
# The address to resume from is stored in a small config file between runs.
#
# Runs on Python 2.6+ and Python 3. On Python 2 the coding line above must be
# one of the first two lines of the file, because the messages are non-ASCII.
import os
import re
import sys

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin

try:
    # Python 2: the built-in input() eval()s whatever the user types.
    _read_line = raw_input
except NameError:  # Python 3: input() already returns the raw text
    _read_line = input

# NOTE(review): the three patterns below were lost from the published listing
# (the HTML tags inside the regex literals were stripped, leaving r'' and
# r'.*?'). They are reconstructions -- verify them against the target site.
IMG_PATTERN = r'<img[^>]*?src="(.*?)"'
TITLE_PATTERN = r'<title>([\s\S]*?)</title>'
LINK_PATTERN = r'<a[^>]*?href="(.*?)"'

# Root directory that receives one sub-directory per crawled page.
GRAPH_ROOT = os.path.join('.', 'GraphFile')


def regFind(reg, text):
    """Return all matches of the regular expression ``reg`` in ``text``.

    Usage: regFind(pattern, text). The result is whatever ``re.findall``
    returns: a list of matched strings, or of group contents when the
    pattern contains groups.
    """
    return re.findall(reg, text)


def getHtml(url, encoding='utf-8'):
    """Download ``url`` and return the response body as a native string.

    ``encoding`` is only used on Python 3, where the body arrives as bytes;
    undecodable bytes are replaced rather than raising.
    NOTE(review): UTF-8 is an assumption -- confirm the target site's charset.
    """
    page = urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()
    if not isinstance(html, str):
        html = html.decode(encoding, 'replace')
    return html


def saveGraph(readyPath, imglist):
    """Download every URL in ``imglist`` into the directory ``readyPath``.

    Usage: saveGraph(path, list_of_image_urls). Files are named ``0.jpg``,
    ``1.jpg``, ... in list order. Returns the number of images downloaded.
    """
    count = 0
    for imgurl in imglist:
        urlretrieve(imgurl, os.path.join(readyPath, '%s.jpg' % count))
        count += 1
    return count


def _safe_dir_name(title):
    """Turn a page title into a name that is usable as a directory name."""
    # Characters that are illegal in Windows file names, plus line breaks.
    name = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .')
    return name or 'untitled'


def getImg(html):
    """Extract the images referenced by ``html`` and download them.

    The images go into ``GRAPH_ROOT/<page title>``. When that directory
    already holds as many entries as the page has images, the page is treated
    as already downloaded and nothing is fetched. Returns the number of
    images downloaded by this call.
    """
    imglist = regFind(IMG_PATTERN, html)
    print("解析出图像" + str(len(imglist)) + '张')

    titlelist = regFind(TITLE_PATTERN, html)
    # A page without a <title> must not abort the crawl with an IndexError.
    htmlTitle = _safe_dir_name(titlelist[0] if titlelist else '')
    print(htmlTitle)

    readyPath = os.path.join(GRAPH_ROOT, htmlTitle)
    x = 0  # image counter
    if os.path.isdir(readyPath) and len(os.listdir(readyPath)) == len(imglist):
        return x

    # Either the directory is missing or the download is incomplete:
    # fetch everything again.
    if not os.path.exists(readyPath):
        try:
            os.makedirs(readyPath)
        except OSError as e:
            print("文件夹创建异常: %s %s : %s" % (htmlTitle, type(e).__name__, e))
    # Only download when the directory really exists.
    if os.path.exists(readyPath):
        x = saveGraph(readyPath, imglist)
        print('获取图片' + str(x) + '张')
    return x


class Config(object):
    """Persist the address at which the next crawl should start."""

    def __init__(self, configFileName='store_html.ini'):
        self.configFileName = configFileName

    def LoadConfig(self):
        """Return the stored address, or '' when no config file exists."""
        if not os.path.exists(self.configFileName):
            return ''
        with open(self.configFileName, 'r') as file_object:
            # Strip so a trailing newline in a hand-edited file is harmless.
            return file_object.read().strip()

    def SaveConfig(self, text):
        """Overwrite the config file with ``text``."""
        with open(self.configFileName, 'w') as file_object:
            file_object.write(text)


def main():
    """Run the crawler interactively, resuming from the stored address."""
    ConfigObj = Config()
    webAddr = ConfigObj.LoadConfig()
    if webAddr == '':
        print("目前没有读到上次结束的地址")
        while True:
            text = _read_line('输入一个url,退出输入exit: ').strip()
            if text == 'exit':
                sys.exit()
            elif text != '':
                webAddr = text
                break
            else:
                print('输入无效')

    deepth = 2  # crawl depth: number of pages to visit in this run
    print('本次服务将执行:' + str(deepth) + '次')

    # Make sure the download root exists.
    if not os.path.exists(GRAPH_ROOT):
        os.mkdir(GRAPH_ROOT)

    for _ in range(deepth):
        print('--------------------分割线----------------------------')
        print('URL地址为 ' + webAddr)
        htmlText = getHtml(webAddr)
        getImg(htmlText)

        GraspList = regFind(LINK_PATTERN, htmlText)
        if not GraspList:
            print("爬虫地址异常: 未找到下一页链接")
            break
        # Prefer the second link and fall back to the first, as the original
        # script did; urljoin resolves it against the current page address.
        nextLink = GraspList[1] if len(GraspList) > 1 else GraspList[0]
        webAddr = urljoin(webAddr, nextLink)

        # Remember where the next run should start.
        ConfigObj.SaveConfig(webAddr)


if __name__ == "__main__":
    main()

开车指南

初次上路，会出现下面的对话：

目前没有读到上次结束的地址
输入一个url,退出输入exit:

在提示符后直接输入一个已知的目标页网址，程序随即开始解析网页；至于网址嘛，为师只能指点你到这了

脚本里有个设置爬虫次数的语句，可按需修改：

deepth = 2      # 爬虫深度

如果一切正常，程序会将下一次运行脚本时的起始网址保存在当前文件夹下的 store_html.ini 内，下载的内容保存在 .\GraphFile 目录下。

就爱阅读 www.92to.com 网友整理上传, 为您提供最全的知识大全, 期待您的分享，转载请注明出处。

来源:

与本文相关文章

暂无,快来抢沙发吧！