- # -*- coding: utf-8 -*-
- # Desc: Grab proxy ip
- # Date: 2014/06/13
import json
import os
import urllib
import urllib2

from bs4 import BeautifulSoup

from util import *
# Module-wide logger (provided by util.getUniqueLog).
log = getUniqueLog()
# Region names as used verbatim in proxy360.cn URL paths.
# NOTE(review): 'Twaiwan' looks like a typo for 'Taiwan' and 'Bahrein' for
# 'Bahrain' -- confirm against the site's actual region paths before changing,
# since these strings are appended to baseUrl unmodified.
regions = ['China', 'America', 'Brazil', 'Japan', 'Twaiwan', 'Thailand', 'Bahrein']
# Base URL of the per-region proxy listing pages.
baseUrl = 'http://www.proxy360.cn/Region/'
# if bypass in limit time return True
def ping(ip):
    """Return True if `ip` answers a single ICMP echo within the time limit.

    Shells out to the system `ping`; any non-zero exit status (unreachable
    host, name-resolution failure, or missing ping binary) yields False.
    """
    # -q: quiet output; -c 1: a single probe.
    # NOTE(review): the original used `-Q`, which on Linux is the QoS flag and
    # takes an argument, so the command always failed and ping() always
    # returned False; `-q` is the intended flag. `-W` is seconds on Linux but
    # milliseconds on macOS/BSD, so 2 gives a ~2 s limit on Linux -- confirm
    # on the deployment platform.
    cmd = "ping -q -c 1 -W 2 %s 1>/dev/null 2>&1" % ip
    return os.system(cmd) == 0
def filterIp(fname, probe=None):
    """Read `fname` (one "<ip> ..." record per line, space-separated) and
    keep the addresses that respond.

    probe: optional callable(ip) -> bool used to test each address;
           defaults to ping(), so existing callers are unchanged.
    Prints and returns the list of responsive IPs.
    """
    if probe is None:
        probe = ping
    result = []
    with open(fname) as fp:
        for line in fp:
            line = line.strip()
            # Skip blank lines. (The original also checked `line is None`,
            # which can never happen when iterating a file.)
            if not line:
                continue
            ip = line.split(' ')[0]
            if probe(ip):
                result.append(ip)
    print(result)
    return result
class Proxy():
    """A single proxy-server record scraped from proxy360.cn."""

    def __init__(self, address, port, hideprop, country, pubdate):
        self.address = address    # IP address string
        self.port = port          # port string
        self.country = country
        self.pubdate = pubdate    # publication date string as shown on the site
        self.hideprop = hideprop  # anonymity level as shown on the site

    def valid(self):
        """True if the proxy host currently answers a ping."""
        return ping(self.address)

    def convertDict(self):
        """Serialize to a plain dict (inverse of initWithDict)."""
        return {'address': self.address, 'port': self.port,
                'country': self.country, 'pubdate': self.pubdate,
                'hideprop': self.hideprop}

    @staticmethod
    def initWithDict(proxyDict):
        """Build a Proxy from a dict produced by convertDict().

        Fixes two defects in the original: it was a plain instance method
        called without self, and it passed country/hideprop in swapped
        positions (the constructor order is address, port, hideprop,
        country, pubdate).
        """
        return Proxy(proxyDict['address'], proxyDict['port'],
                     proxyDict['hideprop'], proxyDict['country'],
                     proxyDict['pubdate'])

    def __str__(self):
        return '%s:%s %s %s %s' % (self.address, self.port, self.country,
                                   self.hideprop, self.pubdate)
# proxy360
def fetchProxies(region='China'):
    """Scrape proxy360.cn's listing page for `region`, ping every listed
    proxy, and save the responsive ones to proxy_<region>.json.

    Relies on fetchPage()/saveJson() from util and BeautifulSoup for parsing.
    Returns None; progress and errors are reported on stdout.
    """
    print('[Region: %s]' % region)
    url = baseUrl + urllib.quote(region)
    print('Fetching page ...')
    page = fetchPage(url)
    if page is None:
        print('Fetch page failed')
        return
    print('Analysising html ...')
    soup = BeautifulSoup(page)
    if soup is None:
        print('parse html failed')
        return
    # Bind before the try block so the finally clause can never see an
    # unbound name (the original raised NameError in `finally` whenever
    # find_all itself threw before validProxies was assigned).
    validProxies = []
    try:
        nodes = soup.find_all('div', class_='proxylistitem')
        total = len(nodes)
        cnt = 0
        validCnt = 0
        for node in nodes:
            cnt += 1
            print('Dealing with (%d/%d:%d)th item ...' % (cnt, total, validCnt))
            proxyItems = node.find_all('span', class_='tbBottomLine')
            # Column order on the site: address, port, hide level, country, date.
            proxy = Proxy(proxyItems[0].text.strip(), proxyItems[1].text.strip(),
                          proxyItems[2].text.strip(), proxyItems[3].text.strip(),
                          proxyItems[4].text.strip())
            print(proxy)
            if proxy.valid():
                validProxies.append(proxy.convertDict())
                validCnt += 1
    except Exception as e:
        print(e)
    finally:
        print('Save proxies ...')
        if validProxies:  # non-empty check; the extra len()>0 was redundant
            saveJson(validProxies, 'proxy_' + region.lower() + '.json')
        print('Congratulation!')
# end fetch proxy
# proxy360.json
def testProxies(fname):
    """Load proxy records from a JSON file (as written by fetchProxies) and
    keep the ones through which the echo-IP page can actually be fetched.

    Deliberately best-effort: any failure (connect error, timeout, malformed
    record) just skips that proxy. Prints and returns the list of working IPs
    (the original built the list but discarded it).
    """
    with open(fname) as fp:
        data = json.load(fp)
    result = []
    for item in data:
        try:
            ip = item['address']
            port = item['port']
            print(ip)
            # Renamed from `socket` to avoid shadowing the stdlib module.
            proxyAddr = '%s:%s' % (ip, port)
            proxy_handler = urllib2.ProxyHandler({'http': proxyAddr})
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
            opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
            # ip138 echoes the caller's apparent IP -- proves the proxy works.
            print(opener.open('http://20140507.ip138.com/ic.asp', timeout=3).read())
            result.append(ip)
        except Exception:
            continue  # dead proxy: skip, keep probing the rest
    print(result)
    return result
# bypass
def checkProxy(ip, port):
    """True if twitter.com is reachable through ip:port as an HTTP proxy,
    i.e. the proxy both works and bypasses the block.

    Best-effort: any exception (connect failure, timeout) means False.
    The original had an unreachable `return True` after the except-return,
    which is removed here.
    """
    try:
        print('%s %s' % (ip, port))
        # Renamed from `socket` to avoid shadowing the stdlib module.
        proxyAddr = '%s:%s' % (ip, port)
        proxy_handler = urllib2.ProxyHandler({'http': proxyAddr})
        proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
        opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
        #print(opener.open('http://20140507.ip138.com/ic.asp', timeout=3).read())
        print(opener.open('http://www.twitter.com', timeout=4).read())
        return True
    except Exception:
        return False
# load from file: ip:port
def loadProxies(fname, checker=None):
    """Read `fname` (one "ip:port" per line) and keep the entries that pass
    the proxy check.

    checker: optional callable(ip, port) -> bool; defaults to checkProxy(),
             so existing callers are unchanged.
    Prints and returns a list of single-entry {ip: port} dicts.
    """
    if checker is None:
        checker = checkProxy
    result = []
    with open(fname) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            segments = line.split(':')
            # Skip malformed lines (the original raised IndexError on any
            # line without a ':').
            if len(segments) < 2:
                continue
            ip = segments[0]
            port = segments[1]
            if checker(ip, port):
                result.append({ip: port})
    print(result)
    return result
if __name__ == '__main__':
    # Timer and setupOpener come from util (wildcard import at the top).
    timer = Timer()
    # NOTE(review): presumably installs a configured global URL opener
    # before fetching -- confirm against util's definition.
    setupOpener()
    fetchProxies('America')
    #loadProxies('proxy2.txt')
    timer.stop()
# This snippet originally comes from http://www.codesnippet.cn/detail/190620149810.html
# Source: http://www.codesnippet.cn/detail/190620149810.html