# -*- coding: GB18030 -*-
import sys
import os
import re
import urllib.request
import urllib.parse
import http.cookiejar
import threading
urllogin = 'http://bbs.artx.cn/logging.php?action=login&loginsubmit=yes&inajax=1'
cj = http.cookiejar.CookieJar()
# Build an opener that carries the cookie jar
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# Install the opener so urlopen() uses it globally
urllib.request.install_opener(opener)
# Dictionary holding the login POST data
postDict = {
    'formhash' : '00e0e70f',
    'referer' : 'http%3A%2F%2Fguji.artx.cn%2F',
    'loginfield' : 'username',
    'username' : 'DEAGS3000',
    'password' : '6c37c803ba32f5edd8d4bf49080bc0d8',
    'questionid' : '0',
    'answer' : '',
    'cookietime' : '2592000',
}
# URL-encode the POST data, then encode it to UTF-8 bytes
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# Request the login page with the POST data attached
resp = urllib.request.urlopen(urllogin, postData)
html = resp.read()
resp2 = urllib.request.urlopen('http://guji.artx.cn/', postData)
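
# The hard-coded 'formhash' above is a session-specific anti-CSRF token, so it
# is likely to go stale. A minimal sketch of fetching a fresh one from the
# login page instead. Both the Discuz!-style markup the regex expects
# (<input type="hidden" name="formhash" value="...">) and the 'gbk' page
# encoding are assumptions, not verified against bbs.artx.cn:
def fetch_formhash(login_url='http://bbs.artx.cn/logging.php?action=login'):
    page = urllib.request.urlopen(login_url).read().decode('gbk', 'ignore')
    m = re.search(r'name="formhash"\s+value="([0-9a-fA-F]+)"', page)
    return m.group(1) if m else None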
def main():
    chooseop = input('Choose an operation:\n1. Parse a single book introduction page\n2. Parse a book topic page\n3. Quit\n')
    if chooseop == '1':
        processurl(input('Enter the URL of the book main page to scrape:\n'), 1)
    elif chooseop == '2':
        processsub(input('Enter the URL of the topic page to scrape:\n'))
    elif chooseop == '3':
        sys.exit()
# Process a book introduction page
def processurl(url, mode):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the raw HTML
    uhtml = html.decode('utf-8')
    # Extract the URLs of all chapter pages
    urls = re.findall(r'(?<=<li><a href=").*\.html(?=">)', uhtml)
    # Extract all chapter titles
    titles = re.findall(r'(?<=\.html">).*(?=</a></li>)', uhtml)
    # Decode the entity placeholders left in the chapter titles; indexed
    # assignment is needed here, since rebinding the loop variable never
    # touches the list. (The exact entities are inferred; the published
    # snippet shows them already decoded to plain characters.)
    for n in range(len(titles)):
        titles[n] = titles[n].replace('&nbsp;', ' ')
        titles[n] = titles[n].replace('&#40;', '(')
        titles[n] = titles[n].replace('&#41;', ')')
    # Extract the book's overall title
    titleinlist = re.findall('(?<=title"><h3>).*(?=</h3></div>)', uhtml)
    # Extract the library (库) the book belongs to
    kuinlist = re.findall(r'(?<=\.html>).库(?=</a> )', uhtml)
    # Extract the topic name
    kindinlist = re.findall('(?<=showmain_kind_z>).*?(?=</a>)', uhtml)
    kind = kindinlist[0]
    ku = kuinlist[0]
    title = titleinlist[0]
    if len(urls) == len(titles):
        # Fetch the introduction page itself first ('简介' = "introduction")
        processurl2(url, '简介', title, ku, kind)
        if len(urls) < 5:
            # Few chapters: fetch them sequentially
            for i in range(len(urls)):
                processurl2("http://guji.artx.cn" + urls[i], titles[i], title, ku, kind)
            if mode == 1:
                main()
        else:
            # Split the chapter list into four slices, one per worker thread;
            # the last slice absorbs the remainder
            num = len(urls)
            every = num // 4
            mod = num % 4
            urlsplit1 = urls[0:every]
            urlsplit2 = urls[every:every*2]
            urlsplit3 = urls[every*2:every*3]
            urlsplit4 = urls[every*3:every*4+mod]
            titlesplit1 = titles[0:every]
            titlesplit2 = titles[every:every*2]
            titlesplit3 = titles[every*2:every*3]
            titlesplit4 = titles[every*3:every*4+mod]
            print("Parsed link count matches chapter count.\n")
            thread1 = Thread(1, 1, urlsplit1, titlesplit1, title, ku, kind)
            thread2 = Thread(2, 2, urlsplit2, titlesplit2, title, ku, kind)
            thread3 = Thread(3, 3, urlsplit3, titlesplit3, title, ku, kind)
            thread4 = Thread(4, 4, urlsplit4, titlesplit4, title, ku, kind)
            thread1.start()
            thread2.start()
            thread3.start()
            thread4.start()
            if mode == 1:
                main()
    else:
        print("Chapter count and link count differ; something is probably wrong!\n")
# Clean the chapter body out of the page HTML
def text(i):
    # Roughly cut out the body text
    text1 = re.findall('(?<=font-size:14px;">).*?(?=</div>)', i, re.DOTALL)
    # Strip the "reading notes" markup
    garbages1 = re.findall('<font class=bj_style>.*?</a></font>', text1[0], re.DOTALL)
    for g1 in garbages1:
        text1[0] = text1[0].replace(g1, '\n ')
    # Strip the "中国古籍全录" promo links
    garbages2 = re.findall('<a href=.http.*?</a>', text1[0], re.DOTALL)
    for g2 in garbages2:
        text1[0] = text1[0].replace(g2, '')
    # Strip any <font class=...> opening tags
    garbages3 = re.findall('<font class=.*?>', text1[0], re.DOTALL)
    for g3 in garbages3:
        text1[0] = text1[0].replace(g3, '')
    # Strip the annotation links
    garbages4 = re.findall('<a href=.*?</a>', text1[0], re.DOTALL)
    for g4 in garbages4:
        text1[0] = text1[0].replace(g4, '')
    # Strip leftover tags and entity placeholders (entity names are inferred;
    # the published snippet shows them already decoded to plain characters)
    text1[0] = text1[0].replace('</strong>', '')
    text1[0] = text1[0].replace('<strong>', '')
    text1[0] = text1[0].replace('</font>', '')
    text1[0] = text1[0].replace('<br>', '')
    text1[0] = text1[0].replace('&nbsp;', '')
    text1[0] = text1[0].replace('&#63;', '?')
    text1[0] = text1[0].replace('&quot;', '"')
    return text1[0]
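
# The chained replace() calls above could be collapsed into one regex pass.
# A sketch only, assuming the body text itself never contains a literal '<';
# unlike text(), it drops every tag outright instead of substituting '\n '
# for the reading-note blocks:
def strip_tags(fragment):
    no_tags = re.sub(r'<[^>]+>', '', fragment, flags=re.DOTALL)
    return no_tags.replace('&nbsp;', '').replace('&quot;', '"')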
# Process a topic page
def processsub(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the raw HTML
    uhtml = html.decode('utf-8')
    urls = re.findall(r'(?<=<a href=").*?html(?=" title=)', uhtml)
    titles = re.findall(r'(?<=\.html" title=").*?(?=" target=_blank>)', uhtml, re.DOTALL)
    numt = len(titles)
    if numt == len(urls):
        print('Parsed book count matches link count.\n')
        # Decode the entity placeholders left in the book titles
        # (indexed assignment, for the same reason as in processurl)
        for n in range(len(titles)):
            titles[n] = titles[n].replace('&nbsp;', ' ')
            titles[n] = titles[n].replace('&#40;', '(')
            titles[n] = titles[n].replace('&#41;', ')')
        subinlist = re.findall('(?<=html">).{2,10}(?=</a></div>)', uhtml)
        print('Topic to download:\n', subinlist[0], '\nIt contains these books:\n', titles)
        for i in urls:
            processurl(i, 2)
        # A disabled polling loop on thread1..thread4 lived here; joining the
        # workers is the direct way to wait (see run_workers below).
    else:
        print('Book count and link count differ; something is probably wrong!\n')
# Worker thread object
class Thread(threading.Thread):
    def __init__(self, num, interval, urlsplit, titlesplit, title, ku, kind):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.thread_stop = False
        self.urlsplit = urlsplit
        self.titlesplit = titlesplit
        self.title = title
        self.ku = ku
        self.kind = kind
    # Each worker calls processurl2 over its slice, then stops itself
    def run(self):
        while not self.thread_stop:
            for i in range(len(self.urlsplit)):
                url1 = self.urlsplit[i]
                title1 = self.titlesplit[i]
                processurl2("http://guji.artx.cn" + url1, title1, self.title, self.ku, self.kind)
            self.stop()
    def stop(self):
        self.thread_stop = True
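
# The commented-out polling loop in processsub() suggests the intent was to
# wait until all four workers had finished. Thread.join() does that directly;
# a minimal sketch (run_workers is illustrative, not part of the original):
def run_workers(threads):
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until this worker has drained its slice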
# Process a chapter page. urls: chapter URL; titles: chapter name;
# title: the book's overall title; ku: library it belongs to; kind: topic name
def processurl2(urls, titles, title, ku, kind):
    #try:
    response1 = urllib.request.urlopen(urls)
    html1 = response1.read()
    uhtml1 = html1.decode('utf-8')
    # Create the library/topic/book directory if it does not exist yet
    os.makedirs('E:/downloadedbooks/' + ku + '/' + kind + '/' + title, exist_ok=True)
    # Extract the chapter body
    article = text(uhtml1)
    # Write the chapter to a GB18030-encoded .txt named after the chapter
    f = open('E:/downloadedbooks/' + ku + '/' + kind + '/' + title + '/' + titles + '.txt', 'w', encoding='GB18030')
    f.write(str(article))
    f.close()
    print(titles, '......... download complete.')
    #except:
    #    print('This chapter raised an exception; handle it manually!')

if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/250220148803.html