不多说了，直接上代码：
- from requests import request
- from bs4 import BeautifulSoup
- import re
- import pymongo
class SpiderDouBan:
    """Scrape the Douban Movie Top 250 list and store each entry in MongoDB."""

    def __init__(self):
        # Connect to a local MongoDB instance; documents are written to
        # the spider_db.douban_movie_top250 collection.
        client = pymongo.MongoClient(host='localhost', port=27017)
        db = client['spider_db']
        self.collection = db['douban_movie_top250']

    def get_html(self, url):
        """Download one listing page and return it parsed with BeautifulSoup.

        :param url: page URL to fetch
        :return: BeautifulSoup object built with the lxml parser
        """
        html = request('get', url).text
        return BeautifulSoup(html, 'lxml')

    def get_one_page(self, soup, order):
        """Extract every movie on one result page and insert it into MongoDB.

        :param soup: BeautifulSoup object for one Top-250 listing page
        :param order: zero-based offset of the first movie on this page
        """
        # Keep only the primary Chinese titles; alternate/foreign titles share
        # the same 'title' class but contain a '/' separator.
        movie_names = [span.string
                       for span in soup.find_all(name='span', attrs={'class': 'title'})
                       if not re.search(r'/', span.string)]
        # Director / actors / year / genre line: normalize whitespace,
        # then split the fields on '/'.
        movie_actors = [re.sub(r'\n|\xa0', '', p.get_text().strip('" "|\n | \xa0')).split('/')
                        for p in soup.find_all(name='p', attrs={'class': ''})]
        movie_rates = [span.string
                       for span in soup.find_all(name='span', attrs={'class': 'rating_num'})]
        # Comment count lives in a sibling of the <span property="v:best"> node.
        # Guard span_2.string against None: tag siblings have no .string and the
        # original re.search(...) would raise TypeError on them.
        # NOTE(review): comment_num is computed but never stored in movie_info —
        # presumably intended to be persisted; confirm with the author.
        comment_num = [span_2.string
                       for span in soup.find_all(attrs={'property': 'v:best'})
                       for span_2 in span.next_siblings
                       if span_2.string and re.search(r'\w+', span_2.string)]
        # BUG FIX: the original used re.sub('.', '', ...) where '.' matches any
        # character, blanking the entire quote. Strip only the trailing Chinese
        # full stop (the '.' was almost certainly a garbled '。').
        short_comments = [re.sub('。', '', span.string) for span in soup.find_all(class_='inq')]
        for index, name in enumerate(movie_names):
            print(f'正在爬取第 {order + index + 1} 条数据...')
            movie_info = {
                'order': f'No.{order + index + 1}',
                'movie_name': name,
                'movie_type': f'{re.findall("[0-9]+", movie_actors[index][-3])[0]}年 /'
                              f'{movie_actors[index][-2]}/{movie_actors[index][-1]}',
                # BUG FIX: the original stored movie_rates[index][0] — only the
                # first digit of a rating like '9.7'. Store the full rating.
                'movie_rate': f'{movie_rates[index]}分',
                'short_comment': f'{short_comments[index]}'
            }
            self.collection.insert_one(movie_info)

    def main(self, url, order):
        """Fetch one page and persist its movies.

        :param url: listing-page URL
        :param order: offset of the first movie on the page
        """
        soup = self.get_html(url)
        self.get_one_page(soup, order)
if __name__ == '__main__':
    # BUG FIX: the original constructed a fresh SpiderDouBan() — and therefore
    # a fresh MongoDB connection — on every iteration. Create one spider and
    # reuse it for all ten pages.
    spider = SpiderDouBan()
    # The Top 250 list is paginated 25 movies per page via the 'start' offset.
    for offset in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={offset}'
        spider.main(url, offset)
运行结果与 MongoDB 存储效果（原文为截图，此处从略）：
来源: http://www.bubuko.com/infodetail-3101368.html