设置随机请求头和 IP 代理池
middlewares.py 代码如下:
import json
import random

import requests
from twisted.internet.defer import DeferredLock

from useragent_randomchange.models import ProxyModel
class UseragentRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that stamps a randomly chosen User-Agent
    header onto every outgoing request, so the crawler does not expose
    a single fixed UA string."""

    # Pool of desktop-browser User-Agent strings to rotate through.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        """Assign one UA from the pool, chosen uniformly at random."""
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
class IPProxyRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that routes requests through a rotating
    HTTP proxy fetched from a proxy-vendor API.

    The current proxy is replaced when it is about to expire or has
    been blacklisted (the target site answered with a non-200 status
    or redirected to a captcha page).
    """

    PROXY_URL = 'xxxxxxxxx'  # proxy-vendor API endpoint URL

    def __init__(self):
        super(IPProxyRandomchangeDownloaderMiddleware, self).__init__()
        self.current_proxy = None
        # Several Twisted callbacks may want a new proxy at once; the
        # lock makes sure only one of them actually refreshes it.
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        # Refresh when we have no proxy yet, the request carries none,
        # or the current one is about to expire.
        # BUGFIX: the original read ``requests.meta`` (the requests
        # library) instead of ``request.meta``.
        if (self.current_proxy is None
                or 'proxy' not in request.meta
                or self.current_proxy.is_expiring):
            self.update_proxy()
        if self.current_proxy is not None:
            request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        # A non-200 status or a captcha page means the site has flagged
        # this proxy as a crawler: blacklist it and fetch a new one.
        if response.status != 200 or "captcha" in response.url:
            if self.current_proxy is not None and not self.current_proxy.blacked:
                self.current_proxy.blacked = True
                print('%s 这个代理被加入黑名单了' % self.current_proxy.ip)
            self.update_proxy()
            # Returning the request re-queues it in the scheduler so the
            # page is retried (with the new proxy) instead of being lost.
            return request
        # A normal response must be returned, otherwise it never reaches
        # the spider for parsing.
        return response

    def update_proxy(self):
        """Fetch a new proxy from PROXY_URL if the current one is
        missing, expiring, or blacklisted.

        Serialized by ``self.lock``; the re-check inside the lock keeps
        concurrent callers from fetching several proxies in a row.
        """
        self.lock.acquire()
        try:
            if (not self.current_proxy
                    or self.current_proxy.is_expiring
                    or self.current_proxy.blacked):
                response = requests.get(self.PROXY_URL)
                text = response.text
                print("重新获取了一个代理:", text)
                # BUGFIX: the original called ``JSON.loads`` — Python's
                # module is the lowercase ``json``.
                result = json.loads(text)
                if len(result['data']) > 0:
                    self.current_proxy = ProxyModel(result['data'][0])
        finally:
            # Always release, even if the HTTP call or the JSON parse
            # raises — otherwise the lock would be held forever.
            self.lock.release()
封装了一个 models.py
from datetime import datetime, timedelta


class ProxyModel(object):
    """Value object describing one proxy returned by the vendor API.

    ``data`` is the raw dict from the API response and must contain the
    keys ``ip``, ``port`` and ``expire_time`` (a string of the form
    ``"YYYY-MM-DD HH:MM:SS"``).
    """

    # Consider the proxy expiring once fewer than this many seconds remain.
    EXPIRY_MARGIN = timedelta(seconds=5)

    def __init__(self, data):
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        # Set to True once the target site blocks this proxy.
        self.blacked = False
        # Parse with strptime instead of splitting the string by hand.
        self.expire_time = datetime.strptime(self.expire_str, "%Y-%m-%d %H:%M:%S")
        # Proxy URL in the form http://ip:port.
        self.proxy = "http://{}:{}".format(self.ip, self.port)

    @property
    def is_expiring(self):
        """True when less than EXPIRY_MARGIN of lifetime remains."""
        return self.expire_time - datetime.now() < self.EXPIRY_MARGIN
还需在 settings.py 中设置
# Register the two custom downloader middlewares with their priorities
# (100 for the User-Agent rotator, 200 for the proxy rotator).
DOWNLOADER_MIDDLEWARES = {
    'useragent_randomchange.middlewares.UseragentRandomchangeDownloaderMiddleware': 100,
    'useragent_randomchange.middlewares.IPProxyRandomchangeDownloaderMiddleware': 200,
}

# Wait one second between consecutive downloads.
DOWNLOAD_DELAY = 1
来源: http://www.bubuko.com/infodetail-3133754.html