Contents
Random User-Agent
Getting proxy IPs
Checking proxy IP availability
Random User-Agent
The fake_useragent library lets you fake the request headers:
- from fake_useragent import UserAgent
- ua = UserAgent()
- # a user agent for IE
- print(ua.ie)
- # Opera
- print(ua.opera)
- # Chrome
- print(ua.chrome)
- # Firefox
- print(ua.firefox)
- # Safari
- print(ua.safari)
- # The most common usage:
- # when writing a crawler, the most useful feature is being able to switch headers at will.
- # Randomness matters, and fake_useragent can generate a random user agent each time.
- print(ua.random)
- print(ua.random)
- print(ua.random)
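In practice you plug ua.random into the headers of each request. A minimal sketch, assuming requests is installed; the target https://httpbin.org/headers is only an illustrative echo service, not part of the original post:
- from fake_useragent import UserAgent
- import requests
-
- ua = UserAgent()
- # a fresh random User-Agent for this request
- headers = {'User-Agent': ua.random}
- # httpbin.org/headers echoes the request headers back, handy for checking what was sent
- resp = requests.get('https://httpbin.org/headers', headers=headers)
- print(resp.json())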
Getting proxy IPs
Scrape proxy IPs from free proxy sites. Collecting free proxies is straightforward: visit the page -> extract with regex / XPath -> save the results.
Proxy IP sites
Youdaili: https://www.youdaili.net/Daili/guonei/
66ip: http://www.66ip.cn/6.html
Xici proxy: https://www.xicidaili.com/
Kuaidaili: https://www.kuaidaili.com/free/
- # Match the page source with a regular expression.
- # This approach works well for paginated sites.
- import re
- import requests
- import time
-
- def get_ip():
-     url = 'https://www.kuaidaili.com/free/inha/'
-     url_list = [url + str(i + 1) for i in range(5)]  # build the URL list; 5 means only the first 5 pages are crawled
-     print(url_list)
-     ip_list = []
-     for i in range(len(url_list)):
-         url = url_list[i]
-         html = requests.get(url=url).text
-         regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
-         matcher = re.compile(regip, re.S)
-         ipstr = re.findall(matcher, html)
-         time.sleep(1)
-         for j in ipstr:
-             ip_list.append(j[0] + ':' + j[1])  # ip:port
-     print(ip_list)
-     print('Collected %d proxy IPs' % len(ip_list))
-     return ip_list
-
- if __name__ == '__main__':
-     get_ip()
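The "save" step of the flow above is not shown in the original; a minimal sketch that reuses get_ip() from the block above and writes one "ip:port" entry per line (the file name ip_pool.txt is just an illustrative choice):
- def save_ip(ip_list, path='ip_pool.txt'):
-     # one "ip:port" entry per line
-     with open(path, 'w', encoding='utf-8') as f:
-         f.write('\n'.join(ip_list))
-
- if __name__ == '__main__':
-     save_ip(get_ip())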
- # First grab the relevant tags,
- # then parse them.
- import requests
- from bs4 import BeautifulSoup
-
- def get_ip_list(obj):
-     ip_text = obj.findAll('tr', {'class': 'odd'})  # all table rows that carry an IP address
-     ip_list = []
-     for i in range(len(ip_text)):
-         ip_tag = ip_text[i].findAll('td')
-         ip_port = ip_tag[1].get_text() + ':' + ip_tag[2].get_text()  # extract the IP address and port
-         ip_list.append(ip_port)
-     print('Collected {} proxy IPs'.format(len(ip_list)))
-     print(ip_list)
-     return ip_list
-
- url = 'http://www.xicidaili.com/'
- headers = {
-     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
- request = requests.get(url, headers=headers)
- response = request.text
- bsObj = BeautifulSoup(response, 'lxml')  # parse the fetched HTML
- lists = get_ip_list(bsObj)
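The XPath route mentioned earlier works too. A minimal sketch with lxml against Kuaidaili's free-proxy list; the data-title attributes in the XPath expressions are an assumption about that site's table markup and may need adjusting:
- import requests
- from lxml import etree
-
- def get_ip_xpath():
-     html = requests.get('https://www.kuaidaili.com/free/inha/1/').text
-     tree = etree.HTML(html)
-     # assumed markup: <td data-title="IP">...</td><td data-title="PORT">...</td>
-     ips = tree.xpath('//td[@data-title="IP"]/text()')
-     ports = tree.xpath('//td[@data-title="PORT"]/text()')
-     return [ip + ':' + port for ip, port in zip(ips, ports)]
-
- print(get_ip_xpath())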
Checking proxy IP availability
Method 1: judge by the returned status code
- import requests
- import re
- import time
-
- def get_ip():
-     url = 'https://www.kuaidaili.com/free/inha/'
-     url_list = [url + str(i + 1) for i in range(1)]
-     print(url_list)
-     ip_list = []
-     for i in range(len(url_list)):
-         url = url_list[i]
-         html = requests.get(url=url).text
-         regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
-         matcher = re.compile(regip, re.S)
-         ipstr = re.findall(matcher, html)
-         time.sleep(1)
-         for j in ipstr:
-             ip_list.append(j[0] + ':' + j[1])
-     print('Collected %d proxy IPs' % len(ip_list))
-     print(ip_list)
-     return ip_list
-
- def valVer(proxys):
-     badNum = 0
-     goodNum = 0
-     good = []
-     for proxy in proxys:
-         try:
-             proxy_host = proxy
-             protocol = 'https' if 'https' in proxy_host else 'http'
-             proxies = {protocol: proxy_host}
-             print('Now testing IP:', proxies)
-             response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
-             if response.status_code != 200:
-                 badNum += 1
-                 print(proxy_host, 'bad proxy')
-             else:
-                 goodNum += 1
-                 good.append(proxies)
-                 print(proxy_host, 'success proxy')
-         except Exception as e:
-             print(e)
-             badNum += 1
-             continue
-     print('success proxy num :', goodNum)
-     print('bad proxy num :', badNum)
-     print(good)
-
- if __name__ == '__main__':
-     ip_list = get_ip()
-     valVer(ip_list)
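Once the good list has been filled, the usual pattern is to pick one validated proxy at random for each request. A minimal sketch, assuming good holds the proxies dicts built in valVer above; fetch_with_proxy is just an illustrative helper name:
- import random
- import requests
-
- def fetch_with_proxy(url, good):
-     # pick one validated proxy at random for this request
-     proxies = random.choice(good)
-     return requests.get(url, proxies=proxies, timeout=5)
-
- # usage: resp = fetch_with_proxy('http://www.baidu.com', good)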
Method 2: verify with the requests package, treating any connection error as a failure
- import requests
- import re
- import time
-
- def get_ip():
-     url = 'https://www.kuaidaili.com/free/inha/'
-     url_list = [url + str(i + 1) for i in range(1)]
-     print(url_list)
-     ip_list = []
-     for i in range(len(url_list)):
-         url = url_list[i]
-         html = requests.get(url=url).text
-         regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
-         matcher = re.compile(regip, re.S)
-         ipstr = re.findall(matcher, html)
-         time.sleep(1)
-         for j in ipstr:
-             ip_list.append(j[0] + ':' + j[1])
-     print(ip_list)
-     print('Collected %d proxy IPs' % len(ip_list))
-     return ip_list
-
- def valVer(proxys):
-     badNum = 0
-     goodNum = 0
-     good = []
-     for proxy in proxys:
-         print('Now checking IP', proxy)
-         try:
-             requests.get('http://wenshu.court.gov.cn/', proxies={'http': 'http://' + str(proxy)}, timeout=2)
-         except:
-             badNum += 1
-             print('connect failed')
-         else:
-             goodNum += 1
-             good.append(proxy)
-             print('success')
-     print('success proxy num :', goodNum)
-     print('bad proxy num :', badNum)
-     print(good)
-
- if __name__ == '__main__':
-     ip_list = get_ip()
-     valVer(ip_list)
Method 3: use telnet
- import telnetlib
-
- try:
-     telnetlib.Telnet('127.0.0.1', port=80, timeout=20)
- except:
-     print('connect failed')
- else:
-     print('success')
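To apply the telnet check to the "ip:port" strings collected above, split each entry into host and port. A minimal sketch; check_by_telnet is just an illustrative helper name, and note that telnetlib was removed from the standard library in Python 3.13:
- import telnetlib
-
- def check_by_telnet(proxy, timeout=5):
-     # proxy is an 'ip:port' string as produced by get_ip()
-     host, port = proxy.split(':')
-     try:
-         telnetlib.Telnet(host, port=int(port), timeout=timeout)
-     except Exception:
-         return False
-     return True
-
- # usage: good = [p for p in get_ip() if check_by_telnet(p)]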
Source: https://www.cnblogs.com/-wenli/p/10211942.html