import re
import urllib.request
from bs4 import BeautifulSoup
import time

url = input("First page URL: ")


def gethtml(url):
    # Fetch the page source and return it as a parsed BeautifulSoup object
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')  # html is the decoded page source (a string)
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def getbookurl(soup):  # collect the link of every book listed on this page
    firsturl2 = []
    bookurl = soup.find_all("h4")
    bookurl1 = re.findall(r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank"', str(bookurl))
    for i in range(0, len(bookurl1)):
        bookurl = "http:" + bookurl1[i]
        soup1 = gethtml(bookurl)  # open each book's page to find the url of its first chapter
        time.sleep(0.2)
        firsturl = soup1.find_all("a", {"class": "red-btn J-getJumpUrl "})
        firsturl1 = re.findall(r'data-firstchapterjumpurl=".*?" href="(.*?)" id="readBtn">', str(firsturl))
        if not firsturl1 or firsturl1[0] == '':  # guard against books with no usable first-chapter link
            continue
        firsturl2.append(firsturl1[0])
    return firsturl2


def getcontent(soup, load):  # extract one chapter and append it to the output file
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    content1 = re.compile(r'<p>([\s\S]*?)</p>')
    content2 = content1.findall(str(content))
    content3 = re.sub(r"</?\w+[^>]*>", '', content2[0])  # strip any remaining HTML tags
    content4 = content3.replace('。', '。\n\n\0\0\0')  # the chapter text is complete at this point
    contentname = re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
    contentname1 = contentname.findall(str(soup))  # extract the chapter title
    book = "----------------------------------------------------------------" + contentname1[0] + "------------------------------------------------------------\n\n\n" + content4
    with open(load, 'a', encoding='utf-8') as f:
        f.write(book)


def nextcontent(soup):  # return the url of the next chapter
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    # print(str(content))
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    if content1 == []:
        # the "next chapter" link sometimes carries a different data-eid, so try the alternative
        step1 = re.compile(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')
        content2 = step1.findall(str(content))
        url = "http:" + content2[0]
        return url
    else:
        url = "http:" + content1[0]
        return url


def panduan(soup):  # panduan ("check"): an empty result means we reached the last chapter
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    # print(str(content))
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    return content1
# -------------------------------------------------------------------------


# -------------------------------------------------------------------------

while True:
    soup2 = gethtml(url)
    firsturl2 = getbookurl(soup2)

    for j in range(0, len(firsturl2)):
        url = "http:" + firsturl2[j]
        soup1 = gethtml("http:" + firsturl2[j])
        bookname = re.findall(r'<h1>(.*?)</h1>', str(soup1))
        load = "d:/88/%s.txt" % bookname[0]
        i = 0
        while True:
            soup = gethtml(url)
            getcontent(soup, load)
            url = nextcontent(soup)
            content1 = panduan(soup)
            i += 1
            print("Chapter %d downloaded" % i)
            if content1 == []:
                break
            time.sleep(0.2)
        print("-------------Book %d downloaded---------" % int(j + 1))
Source: http://www.bubuko.com/infodetail-2087310.html