使用 BeautifulSoup (bs4) 去除特定标签（例如 <br>）.
- # url
- import easygui as g
- import urllib.request
- from bs4 import BeautifulSoup
- import os
- import sys
- import re
- import config.story2 as urls
- # 获取 url
def set_url():
    """Ask the user for the crawl settings via an easygui multi-field dialog.

    The dialog is shown again, with an error message, until every required
    field (a label starting with ``*``) has a non-blank value, or until the
    dialog returns ``None``.

    Returns:
        The list of entered values, in the same order as the field labels,
        or ``None`` when the dialog returned ``None`` (presumably because the
        user cancelled it).
    """
    msg = "请填写一下信息 (其中带 * 号的项为必填项)"
    title = "爬虫练习"
    fieldNames = ["* 小说目录地址", "* 组装前半段", "后半段"]
    fieldValues = g.multenterbox(msg, title, fieldNames)

    # A None result ends the loop immediately and is passed to the caller.
    while fieldValues is not None:
        # One message per required field that was left blank.
        missing = [
            "[%s] 为必填项" % name
            for name, value in zip(fieldNames, fieldValues)
            if name.strip().startswith("*") and value.strip() == ""
        ]
        if not missing:
            break
        # Put each message on its own line so several missing fields stay
        # readable, and keep what the user already typed.
        fieldValues = g.multenterbox(
            "\n".join(missing), title, fieldNames, fieldValues
        )
    return fieldValues
- # 下载网页内容, 找到文章标题和对应的下载路径
def get_urls(seed_url, pre_url, last_url):
    """Collect story titles and their download URLs from the index page.

    Fetches ``seed_url``, decodes it as UTF-8 and scans every
    ``<div class="c-line-bottom">`` element. For each one the ``data-nsrc``
    attribute of its first ``<a>`` is combined with ``pre_url`` and
    ``last_url``, and the text of its first ``<p>`` is used as the title.

    Args:
        seed_url: URL of the index page to download.
        pre_url: Text placed in front of each ``data-nsrc`` value.
        last_url: Text appended after each ``data-nsrc`` value.

    Returns:
        A dict mapping story title to the assembled URL
        (``pre_url + data-nsrc + last_url``). Entries that lack the link,
        the attribute or the title paragraph are skipped.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(seed_url) as response:
        html = response.read().decode('utf-8')
    bs = BeautifulSoup(html, "html.parser")

    # Maps story title -> story URL.
    storyList = {}
    for each in bs.find_all("div", {"class": "c-line-bottom"}):
        link = each.a
        heading = each.p
        if link is None or heading is None:
            # Not a story entry (no <a> or no <p> inside the div).
            continue
        nsrc = link.get("data-nsrc")
        if nsrc is None:
            continue
        title = heading.string
        if title is None:
            # .string is None when <p> holds more than one child node;
            # fall back to the concatenated text so the key is never None.
            title = heading.get_text()
        # Assemble the story URL without clobbering the seed_url argument.
        storyList[title] = pre_url + nsrc + last_url
    return storyList
- # 获取每个小说并下载
def getStory():
    """Download every story found by ``get_urls`` and save it to disk.

    The index, prefix and suffix URLs come from the ``urls`` config module
    (``urls.url1``, ``urls.url2``, ``urls.url3``). Each story page is
    fetched and decoded as UTF-8, its ``<br>`` tags are removed, and the
    string form of the list of remaining ``<p>`` tags is written to
    ``<savepath>\\<title>.txt``.
    """
    # Raw string: single backslashes are already literal here.
    savepath = r"E:\stories"
    # Make sure the target directory exists before writing into it.
    os.makedirs(savepath, exist_ok=True)

    storyList = get_urls(urls.url1, urls.url2, urls.url3)
    for name, story_url in storyList.items():
        # Fetch the story page and release the connection right away.
        with urllib.request.urlopen(story_url) as response:
            html = response.read().decode('utf-8')
        bs = BeautifulSoup(html, "html.parser")

        # Strip the <br> tags from the soup itself BEFORE calling find_all:
        # the soup object is callable (bs('br')), but the ResultSet returned
        # by find_all is not (TypeError: 'ResultSet' object is not callable).
        # Replacing the styled <br .../> markup as text did not work either.
        for br in bs('br'):
            br.extract()
        content = bs.find_all('p')

        # Write as UTF-8 explicitly; the platform default encoding may not
        # be able to represent every character of the downloaded text.
        target = os.path.join(savepath, name + ".txt")
        with open(target, 'w', encoding='utf-8') as f:
            f.write(str(content))
# Start the crawl only when this file is run as a script, so that importing
# the module does not trigger network access or file writes.
if __name__ == "__main__":
    getStory()
来源: http://www.bubuko.com/infodetail-2562420.html