# -*- coding: GB18030 -*-
import sys
import os
import re
import urllib.request
import urllib.parse
import http.cookiejar
import threading
urllogin = 'http://bbs.artx.cn/logging.php?action=login&loginsubmit=yes&inajax=1'
cj = http.cookiejar.CookieJar()
# Build an opener that carries the cookie jar
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# Install the opener so urlopen() uses it globally
urllib.request.install_opener(opener)
# Dictionary holding the login POST data
postDict = {
    'formhash' : '00e0e70f',
    'referer' : 'http%3A%2F%2Fguji.artx.cn%2F',
    'loginfield' : 'username',
    'username' : 'DEAGS3000',
    'password' : '6c37c803ba32f5edd8d4bf49080bc0d8',
    'questionid' : '0',
    'answer' : '',
    'cookietime' : '2592000',
}
# URL-encode the POST data, then encode it to UTF-8 bytes
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# Request the login page with the POST data attached
resp = urllib.request.urlopen(urllogin, postData)
html = resp.read()
resp2 = urllib.request.urlopen('http://guji.artx.cn/', postData)
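
# The hard-coded 'formhash' above is a session-specific anti-CSRF token, so it
# is likely to go stale. A minimal sketch of fetching a fresh one from the
# login page instead. Both the Discuz!-style markup the regex expects
# (<input type="hidden" name="formhash" value="...">) and the 'gbk' page
# encoding are assumptions, not verified against bbs.artx.cn:
def fetch_formhash(login_url='http://bbs.artx.cn/logging.php?action=login'):
    page = urllib.request.urlopen(login_url).read().decode('gbk', 'ignore')
    m = re.search(r'name="formhash"\s+value="([0-9a-fA-F]+)"', page)
    return m.group(1) if m else None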
def main():
    chooseop = input('Choose an operation:\n1. Parse a single book introduction page\n2. Parse a book topic page\n3. Quit\n')
    if chooseop == '1':
        processurl(input('Enter the URL of the book main page to scrape:\n'), 1)
    elif chooseop == '2':
        processsub(input('Enter the URL of the topic page to scrape:\n'))
    elif chooseop == '3':
        sys.exit()
# Process a book introduction page
def processurl(url, mode):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the raw HTML
    uhtml = html.decode('utf-8')
    # Extract the URLs of all chapter pages
    urls = re.findall(r'(?<=<li><a href=").*\.html(?=">)', uhtml)
    # Extract all chapter titles
    titles = re.findall(r'(?<=\.html">).*(?=</a></li>)', uhtml)
    # Decode the entity placeholders left in the chapter titles; indexed
    # assignment is needed here, since rebinding the loop variable never
    # touches the list. (The exact entities are inferred; the published
    # snippet shows them already decoded to plain characters.)
    for n in range(len(titles)):
        titles[n] = titles[n].replace('&nbsp;', ' ')
        titles[n] = titles[n].replace('&#40;', '(')
        titles[n] = titles[n].replace('&#41;', ')')
    # Extract the book's overall title
    titleinlist = re.findall('(?<=title"><h3>).*(?=</h3></div>)', uhtml)
    # Extract the library (库) the book belongs to
    kuinlist = re.findall(r'(?<=\.html>).库(?=</a> )', uhtml)
    # Extract the topic name
    kindinlist = re.findall('(?<=showmain_kind_z>).*?(?=</a>)', uhtml)
    kind = kindinlist[0]
    ku = kuinlist[0]
    title = titleinlist[0]
    if len(urls) == len(titles):
        # Fetch the introduction page itself first ('简介' = "introduction")
        processurl2(url, '简介', title, ku, kind)
        if len(urls) < 5:
            # Few chapters: fetch them sequentially
            for i in range(len(urls)):
                processurl2("http://guji.artx.cn" + urls[i], titles[i], title, ku, kind)
            if mode == 1:
                main()
        else:
            # Split the chapter list into four slices, one per worker thread;
            # the last slice absorbs the remainder
            num = len(urls)
            every = num // 4
            mod = num % 4
            urlsplit1 = urls[0:every]
            urlsplit2 = urls[every:every*2]
            urlsplit3 = urls[every*2:every*3]
            urlsplit4 = urls[every*3:every*4+mod]
            titlesplit1 = titles[0:every]
            titlesplit2 = titles[every:every*2]
            titlesplit3 = titles[every*2:every*3]
            titlesplit4 = titles[every*3:every*4+mod]
            print("Parsed link count matches chapter count.\n")
            thread1 = Thread(1, 1, urlsplit1, titlesplit1, title, ku, kind)
            thread2 = Thread(2, 2, urlsplit2, titlesplit2, title, ku, kind)
            thread3 = Thread(3, 3, urlsplit3, titlesplit3, title, ku, kind)
            thread4 = Thread(4, 4, urlsplit4, titlesplit4, title, ku, kind)
            thread1.start()
            thread2.start()
            thread3.start()
            thread4.start()
            if mode == 1:
                main()
    else:
        print("Chapter count and link count differ; something is probably wrong!\n")
# Clean the chapter body out of the page HTML
def text(i):
    # Roughly cut out the body text
    text1 = re.findall('(?<=font-size:14px;">).*?(?=</div>)', i, re.DOTALL)
    # Strip the "reading notes" markup
    garbages1 = re.findall('<font class=bj_style>.*?</a></font>', text1[0], re.DOTALL)
    for g1 in garbages1:
        text1[0] = text1[0].replace(g1, '\n ')
    # Strip the "中国古籍全录" promo links
    garbages2 = re.findall('<a href=.http.*?</a>', text1[0], re.DOTALL)
    for g2 in garbages2:
        text1[0] = text1[0].replace(g2, '')
    # Strip any <font class=...> opening tags
    garbages3 = re.findall('<font class=.*?>', text1[0], re.DOTALL)
    for g3 in garbages3:
        text1[0] = text1[0].replace(g3, '')
    # Strip the annotation links
    garbages4 = re.findall('<a href=.*?</a>', text1[0], re.DOTALL)
    for g4 in garbages4:
        text1[0] = text1[0].replace(g4, '')
    # Strip leftover tags and entity placeholders (entity names are inferred;
    # the published snippet shows them already decoded to plain characters)
    text1[0] = text1[0].replace('</strong>', '')
    text1[0] = text1[0].replace('<strong>', '')
    text1[0] = text1[0].replace('</font>', '')
    text1[0] = text1[0].replace('<br>', '')
    text1[0] = text1[0].replace('&nbsp;', '')
    text1[0] = text1[0].replace('&#63;', '?')
    text1[0] = text1[0].replace('&quot;', '"')
    return text1[0]
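
# The chained replace() calls above could be collapsed into one regex pass.
# A sketch only, assuming the body text itself never contains a literal '<';
# unlike text(), it drops every tag outright instead of substituting '\n '
# for the reading-note blocks:
def strip_tags(fragment):
    no_tags = re.sub(r'<[^>]+>', '', fragment, flags=re.DOTALL)
    return no_tags.replace('&nbsp;', '').replace('&quot;', '"')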
# Process a topic page
def processsub(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the raw HTML
    uhtml = html.decode('utf-8')
    urls = re.findall(r'(?<=<a href=").*?html(?=" title=)', uhtml)
    titles = re.findall(r'(?<=\.html" title=").*?(?=" target=_blank>)', uhtml, re.DOTALL)
    numt = len(titles)
    if numt == len(urls):
        print('Parsed book count matches link count.\n')
        # Decode the entity placeholders left in the book titles
        # (indexed assignment, for the same reason as in processurl)
        for n in range(len(titles)):
            titles[n] = titles[n].replace('&nbsp;', ' ')
            titles[n] = titles[n].replace('&#40;', '(')
            titles[n] = titles[n].replace('&#41;', ')')
        subinlist = re.findall('(?<=html">).{2,10}(?=</a></div>)', uhtml)
        print('Topic to download:\n', subinlist[0], '\nIt contains these books:\n', titles)
        for i in urls:
            processurl(i, 2)
        # A disabled polling loop on thread1..thread4 lived here; joining the
        # workers is the direct way to wait (see run_workers below).
    else:
        print('Book count and link count differ; something is probably wrong!\n')
# Worker thread object
class Thread(threading.Thread):
    def __init__(self, num, interval, urlsplit, titlesplit, title, ku, kind):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.thread_stop = False
        self.urlsplit = urlsplit
        self.titlesplit = titlesplit
        self.title = title
        self.ku = ku
        self.kind = kind
    # Each worker calls processurl2 over its slice, then stops itself
    def run(self):
        while not self.thread_stop:
            for i in range(len(self.urlsplit)):
                url1 = self.urlsplit[i]
                title1 = self.titlesplit[i]
                processurl2("http://guji.artx.cn" + url1, title1, self.title, self.ku, self.kind)
            self.stop()
    def stop(self):
        self.thread_stop = True
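
# The commented-out polling loop in processsub() suggests the intent was to
# wait until all four workers had finished. Thread.join() does that directly;
# a minimal sketch (run_workers is illustrative, not part of the original):
def run_workers(threads):
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until this worker has drained its slice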
# Process a chapter page. urls: chapter URL; titles: chapter name;
# title: the book's overall title; ku: library it belongs to; kind: topic name
def processurl2(urls, titles, title, ku, kind):
    #try:
    response1 = urllib.request.urlopen(urls)
    html1 = response1.read()
    uhtml1 = html1.decode('utf-8')
    # Create the library/topic/book directory if it does not exist yet
    os.makedirs('E:/downloadedbooks/' + ku + '/' + kind + '/' + title, exist_ok=True)
    # Extract the chapter body
    article = text(uhtml1)
    # Write the chapter to a GB18030-encoded .txt named after the chapter
    f = open('E:/downloadedbooks/' + ku + '/' + kind + '/' + title + '/' + titles + '.txt', 'w', encoding='GB18030')
    f.write(str(article))
    f.close()
    print(titles, '......... download complete.')
    #except:
    #    print('This chapter raised an exception; handle it manually!')

if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/250220148803.html