最近老是写 selenium 的爬虫, 想复习下 requests + BeautifulSoup 爬取网站内容
先写一下思路: 打开网站, 获取网站的首页显示的小说 --------------> 根据输入的内容来进行判断是否含有该小说, 有, 就对该小说进行访问 -------------> 打开含有小说目录的网页, 匹配章节名称和 URL----------> 循环获取文本内容, 并对内容进行清理, 写入文本文档
全部代码:
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import requests
- import datetime
- from bs4 import BeautifulSoup
- import time
def book_info():
    """Scrape the quanshuwang.com front page and return a {title: url} dict.

    Collects both the "hot" books (``a.msgBorder`` links) and the
    "wonderful" books (``a.clearfix stitle`` links) into one dict.

    Returns:
        dict[str, str]: mapping of book title -> book detail-page URL.
    """
    url = "http://www.quanshuwang.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"}
    html = requests.get(url, timeout=30, headers=headers)
    time.sleep(2)  # be polite to the server
    # the site is not UTF-8; let requests guess the real encoding
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, "html.parser")

    book_dict = {}
    # hot books
    for a in soup.select("a.msgBorder"):
        book_dict[a["title"]] = a["href"]
    # "wonderful" books (added after, so they overwrite duplicate titles,
    # same as the original two-dict merge order)
    for a in soup.find_all(name="a", attrs={"class": "clearfix stitle"}):
        book_dict[a["title"]] = a["href"]
    return book_dict
# Ask the user which book they want
def search_book(book_name, book_dict):
    """Look up *book_name* in *book_dict*.

    Args:
        book_name: title entered by the user.
        book_dict: {title: url} mapping from :func:`book_info`.

    Returns:
        The book's URL, or the not-found message string (callers compare
        against this exact string, so it must not change).
    """
    return book_dict.get(book_name, "对不起, 您要查询的书籍没有找到")
# Visit the book's page and collect its chapter list
def down_book(url_1):
    """Fetch a book's intro page, follow its "start reading" button, and
    return the chapter catalogue.

    Args:
        url_1: URL of the book's detail page.

    Returns:
        dict[str, str]: mapping of chapter title -> chapter URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"}
    html2 = requests.get(url_1, timeout=30, headers=headers)
    html2.encoding = html2.apparent_encoding
    # NOTE: original passed a bare `html.parser` (undefined name `html`);
    # the parser must be the string "html.parser"
    soup2 = BeautifulSoup(html2.text, "html.parser")
    # the page has a "start reading" button; follow it to reach the
    # table-of-contents page
    read_url = soup2.select("a.reader")[0]["href"]
    html3 = requests.get(read_url, timeout=30, headers=headers)
    html3.encoding = html3.apparent_encoding
    soup3 = BeautifulSoup(html3.text, "html.parser")
    catalog_dict = {}
    for a in soup3.select('div[class="clearfix dirconone"] a'):
        catalog_dict[a["title"]] = a["href"]
    return catalog_dict
def write_book(book_name, dicts):
    """Download every chapter and write the whole book to ``<book_name>.txt``.

    Args:
        book_name: used as the output file name.
        dicts: {chapter_title: chapter_url} mapping from :func:`down_book`.

    Returns:
        None.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"}
    with open("%s.txt" % book_name, "w+", encoding="utf-8") as w_b:
        for title, chapter_url in dicts.items():
            w_b.write("\n\n\n%s \n\n\n" % title)
            html4 = requests.get(chapter_url, timeout=30, headers=headers)
            html4.encoding = html4.apparent_encoding
            soup4 = BeautifulSoup(html4.text, "html.parser")
            chapter_text = ""
            for node in soup4.select("div.mainContenr"):
                # strip layout padding and the site's inline JS calls
                # (the padding string was garbled in the source; presumably
                # it was a run of &nbsp;/spaces — TODO confirm)
                chapter_text += (node.text
                                 .replace("\xa0", "")
                                 .replace("style5();", "")
                                 .replace("style6();", ""))
            print(chapter_text)
            w_b.write(chapter_text)
    # the with-block closes the file; the original's `return w_b.close()`
    # was redundant and also returned None
if __name__ == "__main__":
    start_time = datetime.datetime.now()
    wanted = input("请输入你要查询的书籍:")
    # Fetch the catalogue ONCE; the original called book_info() (a full
    # site scrape) up to three times per run.
    result = search_book(wanted, book_info())
    if result != "对不起, 您要查询的书籍没有找到":
        write_book(wanted, down_book(result))
    else:
        print(result)
    elapsed = (datetime.datetime.now() - start_time).seconds
    print("此次运行耗时 %s 秒." % elapsed)
代码中都进行了注释, 如果有不懂的地方, 请在文章下方进行评论
谢谢您的阅读!
----------------by sniper-huohuo -----------------
------------ 知耻而后勇 --------------
来源: http://www.bubuko.com/infodetail-2546502.html