爬取知乎 Python 中文社区信息, https://zhuanlan.zhihu.com/zimei
- import requests
- from urllib.parse import urlencode
- from pyquery import PyQuery as pq
- from pymongo import MongoClient
- import json
- import time
# Article-list endpoint of the "zimei" column, ten articles per request.
# The trailing '&' lets get_page() append its urlencoded query directly.
base_url = 'https://www.zhihu.com/api/v4/columns/zimei/articles?limit=10&'

# Headers sent with every API request.
headers = dict(
    authority='www.zhihu.com',
    referer='https://zhuanlan.zhihu.com/zimei',
    origin='https://zhuanlan.zhihu.com',
)
headers['User-Agent'] = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
    'ApplewebKit/537.36 (Khtml, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36'
)

# Number of pages the __main__ loop walks through.
max_page = 100

# MongoDB handles: database 'zhihu', collection 'zhihu'. MongoClient() is
# called without arguments, so the driver's default connection settings apply.
client = MongoClient()
db = client['zhihu']
collection = db['zhihu']
def get_page(page, timeout=10):
    """Fetch one page of the column's article list from the Zhihu API.

    Args:
        page: 1-based page number. Page 1 maps to ``offset=0`` so the first
            ten articles are included (the page size of ten matches the
            ``limit=10`` baked into ``base_url``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure, or a body that
        is not valid JSON).
    """
    params = {
        'offset': (page - 1) * 10,
    }
    url = base_url + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a response body that cannot be decoded as JSON.
        print('Error', e.args)
    return None
def parse_page(json_1):
    """Yield one flat record per article contained in an API response.

    Args:
        json_1: Decoded JSON response, or a falsy value (e.g. ``None``) when
            the request failed.

    Yields:
        dict with the keys ``name``, ``title``, ``text``, ``comments``,
        ``reposts`` and ``data``. Nothing is yielded when ``json_1`` is
        falsy or carries no ``data`` list.
    """
    if not json_1:
        return
    # No dependency on a module-level ``page`` variable: every page handed to
    # this function is parsed, and the function can be called from anywhere.
    for item in json_1.get('data') or []:
        author = item.get('author') or {}
        excerpt = item.get('excerpt')
        zhihu = {}
        zhihu['name'] = author.get('name')
        zhihu['title'] = item.get('title')
        # The excerpt is run through pyquery to reduce it to plain text; an
        # empty or missing excerpt is stored as '' without invoking pq.
        zhihu['text'] = pq(excerpt).text() if excerpt else ''
        zhihu['comments'] = item.get('comment_count')
        # Key names are kept unchanged for existing consumers of the output:
        # 'reposts' holds the upvote count and 'data' the formatted timestamp.
        zhihu['reposts'] = item.get('voteup_count')
        zhihu['data'] = time.strftime(
            '%Y-%m-%d %H:%M', time.localtime(item.get('updated'))
        )
        yield zhihu
def write_to_file(content, path='zhihu.json'):
    """Append ``content`` to a text file as a single JSON line.

    Args:
        content: JSON-serialisable object to store.
        path: Destination file; defaults to ``'zhihu.json'`` in the current
            working directory. The file is opened in append mode, so earlier
            lines are preserved.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def save_to_mongo(result):
    """Insert one parsed record into the module-level MongoDB ``collection``.

    Args:
        result: Document (dict) to store. Per the PyMongo documentation the
            driver adds an ``_id`` field to the dict if it has none.

    Prints 'Saved to Mongo' once the document has been inserted.
    """
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported single-document replacement.
    outcome = collection.insert_one(result)
    if outcome.inserted_id is not None:
        print('Saved to Mongo')
if __name__ == '__main__':
    # Walk pages 1..max_page. Each parsed article is echoed to stdout,
    # appended to the JSON-lines file and then stored in MongoDB, in that order.
    for page in range(1, max_page + 1):
        for result in parse_page(get_page(page)):
            print(result)
            write_to_file(result)
            save_to_mongo(result)
来源: http://www.bubuko.com/infodetail-2659777.html