代理池
实现了一个简单的代理池: 获取免费的代理 ip, 并使用其中有效的 ip 进行爬取工作
- import requests
- import re
- from lxml import etree
# Listing page of the free-proxy site that ip_run() downloads to find
# candidate proxies.
url = 'https://www.xicidaili.com/nn/'

# Browser-like headers sent with the listing and scraping requests.
# The product tokens use the casing a real Chrome build sends
# ("AppleWebKit", "KHTML"); the previous spelling ("ApplewebKit", "Khtml")
# did not match any genuine browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}
def get(proxies, timeout=10):
    """Scrape the Douban Top 250 listing through the given proxy.

    Requests ``https://movie.douban.com/top250`` ten times, with the
    ``start`` query parameter stepping 0, 25, ..., 225, and extracts four
    columns from each response with XPath.  The extracted values are appended
    to the module-level lists ``title``, ``movie_url``, ``fen`` and ``ping``.

    The module-level lists are only extended after every page has been
    fetched and parsed, so a failure part-way through leaves them untouched
    and a retry with another proxy cannot produce duplicated rows.

    Args:
        proxies: Proxy mapping in the format accepted by ``requests``
            (scheme -> proxy URL).
        timeout: Seconds to wait for each page before giving up.  Without a
            timeout an unresponsive proxy would block forever.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    url2 = 'https://movie.douban.com/top250'
    found_title, found_url, found_fen, found_ping = [], [], [], []
    for start in range(0, 250, 25):
        response = requests.get(
            url=url2,
            headers=headers,
            proxies=proxies,
            params={'start': start},
            timeout=timeout,
        )
        # Do not parse an error page as if it were a listing page.
        response.raise_for_status()
        tree = etree.HTML(response.text)
        found_title.extend(tree.xpath("//div[@class='item']//a/span[1]/text()"))
        # NOTE(review): this query matches every <a> under an item, while the
        # title query only matches anchors that contain a <span>.  If the
        # page has more than one anchor per item the lists will differ in
        # length and zip() in the caller will misalign rows -- confirm
        # against the live markup.
        found_url.extend(tree.xpath("//div[@class='item']//a/@href"))
        found_fen.extend(tree.xpath("//div[@class='star']//span[2]/text()"))
        found_ping.extend(tree.xpath("//div[@class='star']//span[4]/text()"))
    # Commit only once the whole crawl has succeeded.
    title.extend(found_title)
    movie_url.extend(found_url)
    fen.extend(found_fen)
    ping.extend(found_ping)
# Build the proxy pool
def ip_run():
    """Find a working free proxy and run the scraper through it.

    Downloads the listing page at ``url``, extracts candidate IP addresses
    and ports with regular expressions, and probes each candidate with a
    2-second request to ``http://www.baidu.com``.  The first candidate that
    answers is handed to ``get``; if the scrape finishes, the loop stops.
    If the probe or the scrape fails with a network error, the next candidate
    is tried.

    Only ``requests.RequestException`` is treated as "proxy unusable"; any
    other exception (for example a programming error inside ``get``)
    propagates instead of being reported as a bad proxy.

    Raises:
        requests.RequestException: If the proxy listing page itself cannot
            be fetched.
    """
    ip_response = requests.get(url=url, headers=headers, timeout=10).text
    # Raw strings: "\d" in a normal string literal is an invalid escape.
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response, re.S)
    # NOTE(review): addresses and ports are paired purely by position; this
    # assumes the port is the only all-digit <td> in each row -- confirm
    # against the listing markup.
    ports = re.findall(r"<td>(\d+)</td>", ip_response, re.S)
    for host, port in zip(ips, ports):
        proxy_url = "http://" + host + ":" + port
        # The same HTTP proxy endpoint is used for both http and https traffic.
        proxies = {
            "http": proxy_url,
            "https": proxy_url,
        }
        try:
            requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
        except requests.RequestException:
            print("ip 不能使用")
            continue
        print("ip 能使用")
        try:
            get(proxies)
        except requests.RequestException:
            # The proxy passed the probe but died during the crawl.
            print("ip 不能使用")
            continue
        break
# Result accumulators that get() extends in place.  They are defined at
# module level (not inside the __main__ guard) so that get() and ip_run()
# also work when this module is imported instead of being run as a script.
title = []
movie_url = []
fen = []
ping = []

if __name__ == '__main__':
    ip_run()
    # zip() stops at the shortest list, so rows are only printed up to the
    # length of the shortest result column.
    for row in zip(title, movie_url, fen, ping):
        print(row)
来源: http://www.bubuko.com/infodetail-3358251.html