1. 抓包
2. 代码
抓取:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #author tom
- import requests
- from multiprocessing import Queue
- from handle_pymongo import mongo
- from concurrent.futures import ThreadPoolExecutor
- class Douguo():
- def __init__(self):
- self.queue_list=Queue()
- #因为所有的请求的请求头都一样, 所以放在这边处理, 其实请求头也可以放在__init__
- def handle_request(self,url,data):
- headers={
- "client":"4",
- "version":"6920.4",
- "device":"SM-G9350",
- "sdk":"22,5.1.1",
- "imei":"861373280750547",
- "channel":"qqkp",
- # "mac":"2c:c3:82:e2:0b:03",
- "resolution":"1024*576",
- "dpi":"1.19375",
- # "android-id":"4014041355524873",
- # "pseudo-id":"28075263",
- "brand" :"samsung",
- "scale" :"1.19375",
- "timezone":"28800",
- "language":"zh",
- "cns":"3",
- # "imsi":"460005263415341",
- "user-agent":"Mozilla/5.0 (Linux; Android 5.1.1; SM-G9350 Build/LMY48Z) ApplewebKit/537.36 (Khtml, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
- "reach" :"1",
- "newbie":"1",
- "Content-Type":"application/x-www-form-urlencoded; charset=utf-8",
- "Accept-Encoding":"gzip, deflate",
- # "Cookie":"duid=59758072", #能不带最好不带
- "Host":"api.douguo.net",
- # "Content":"-Length 68",
- "Connection":"keep-alive"
- }
- response = requests.post(url=url, headers=headers,data=data)
- return response
- #请求食谱首页
- def handle_index(self):
- url='http://api.douguo.net/recipe/flatcatalogs'
- data={
- "client":"4",
- # "_session":"1557318413116861373280750547",
- # "v":"1503650468",
- "_vs": "2305"
- }
- response_index=self.handle_request(url,data)
- indext_response_dict=response_index.JSON()
- #遍历这个三级分类
- for index_item in indext_response_dict['result']['cs']:
- for items in index_item['cs']:
- for item in items['cs']:
- #每一个小分类
- data2={
- "client": "4",
- # "_session": "1557318413116861373280750547",
- "keyword":item['name'],
- "order": "3",
- "_vs": "400"
- }
- self.queue_list.put(data2)
- #请求具体食材的做法
- def handle_caipu_list(self,data):
- print("当前处理的食材是:",data['keyword'])
- #翻页
- for i in range(1,11):
- caipu_list_url='http://api.douguo.net/recipe/v2/search/{0}/20'.format(str(i*20))
- caipu_list_response=self.handle_request(url=caipu_list_url,data=data)
- caipu_list_dict=caipu_list_response.JSON()
- #判断是否有数据
- if caipu_list_dict['result']['end']==1:
- break
- for item in caipu_list_dict['result']['list']:
- caipu_info={}
- caipu_info['shicai'] = data['keyword']
- if item['type']==13:
- caipu_info['user_name']=item['r']['an']
- caipu_info['shicai_id']=item['r']['id']
- caipu_info['describe']=item['r']['cookstory'].replace('/n','').replace(' ','')
- caipu_info['caipu_name']=item['r']['n']
- caipu_info['zuoliao_list']=item['r']['major']
- # print(caipu_info)
- #获取详情页的内容
- detail_url='http://api.douguo.net/recipe/detail/'+str(caipu_info['shicai_id'])
- detail_data={
- "client": "4",
- # "_session": "1557318413116861373280750547",
- "author_id": "0",
- "_vs": "2803",
- "_ext":'{"query": {"kw":'+caipu_info['shicai']+',"src":"2803","idx":"1","type":"13","id":'+str(caipu_info['shicai_id'])+'}}'
- }
- detail_response=self.handle_request(url=detail_url,data=detail_data)
- detail_response_dict=detail_response.JSON()
- caipu_info['tips']=detail_response_dict['result']['recipe']['tips']
- caipu_info['cook_step']=detail_response_dict['result']['recipe']['cookstep']
- print('当前入库的是:',caipu_info['caipu_name'])
- mongo.inset_item(caipu_info)
- #g 过滤掉广告
- else:
- continue
- if __name__ == '__main__':
- d=Douguo()
- d.handle_index()
- pool=ThreadPoolExecutor(max_workers=20)
- while d.queue_list.qsize()>0:
- pool.submit(d.handle_caipu_list,d.queue_list.get())
存储:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #author tom
- import pymongo
- class Connect_Mongo(object):
- def __init__(self):
- self.client=pymongo.MongoClient(host='127.0.0.1',port=27017)
- self.db=self.client['doukou_meishi']
- def inset_item(self,item):
- self.collection=self.db['meishi']
- self.collection.insert(item)
- mongo=Connect_Mongo()
来源: http://www.bubuko.com/infodetail-3052514.html