# -*- coding: utf-8 -*-
"""Log in to a site through a proxy with ``requests`` and keep the session.

Flow, as implemented below:

1. GET ``base_url``.  The final URL of that response is expected to be the
   login page, carrying a ``?service=`` query parameter.
2. Extract the ``lt`` token from the login page body, and the ``service``
   value and the login URL from the final URL.
3. POST the credentials through a ``requests`` session with redirects
   disabled and print the ``Location`` header of the reply.

This is a Python 2 script: it relies on ``reload(sys)`` and
``sys.setdefaultencoding``, neither of which exists in Python 3.
"""
import re
import sys
import urllib  # only needed by the optional redirect snippet kept below
import warnings

import requests

# Suppress all warnings.  NOTE(review): this also hides the warnings that
# requests/urllib3 emit because every call below passes verify=False.
warnings.filterwarnings('ignore')

# Change the interpreter-wide default encoding to UTF-8 (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf8')

# --- Simplified variant (kept for reference, never executed) ---------------
# NOTE(review): the two regex patterns in this variant were already empty in
# the source this script was taken from; they presumably matched the
# '<td id="submit-container">' / '<input type="hidden" name="lt" ...>'
# patterns shown in the alternative extraction further down -- confirm.
#
# proxies={'http':'http://username::8080',
#          'https': 'https://username::8080'}
# url='https://www.telegeography_provider-profiles.com/products/global-internet-geography/provider-rankings/provider-profiles/index.html'
# base_url='https://www.telegeography_provider-profiles.com/login/login'
# data={
#     'username':'username',
#     'password':'password',
#     'service':'https://www.telegeography_provider-profiles.com/products/global-internet-geography/provider-rankings/provider-profiles/index.html'
# }
# req=requests.get(base_url,proxies=proxies,verify=False).content
# pattern=re.compile('(.*?)',re.S)
# a=re.findall(pattern,req)[0]
# data['lt']=re.findall('',a,re.S)[0]
# s=requests.session()
# re=s.post(base_url,data=data,proxies=proxies,verify=False)
# res=s.get(url,proxies=proxies,verify=False).content
# print res
# ---------------------------------------------------------------------------

# Proxy used for every request.  The credentials and host are placeholders
# and must be replaced before the script can run.
proxies = {
    'http': 'http://username::8080',
    'https': 'https://username::8080',
}

# Form fields for the login POST; 'lt' and 'service' are added below.
data = dict(username='username', password='password')

base_url = 'https://www.telegeography_provider-profiles.com/products/global-internet-geography/provider-rankings/provider-profiles/index.html'

s = requests.session()

# Request the initial page to obtain the fields needed for login: the POST
# data and the URL to log in at.
# NOTE(security): verify=False disables TLS certificate verification.
base_res = requests.get(base_url, proxies=proxies, verify=False)

# Take the first double-quoted token that starts with two capital letters and
# a hyphen -- presumably the login ticket; verify against the page markup.
# Raises IndexError when the page contains no such token.
data['lt'] = re.findall(r'"([A-Z]{2}-\w.+?)" ', base_res.content)[0]

# Alternative, more literal extraction: to read an <input> inside the form,
# first cut out the enclosing element, then match the input itself.  Mind how
# whitespace affects what the () group captures, and note the slash that
# closes the tag.
# pattern=re.compile('<td id="submit-container">(.*?)</td>',re.S)
# d= re.findall(pattern, base_res.content)[0]
# data['lt']=re.findall(r'<input type="hidden" name="lt" value="(.*?)" id="lt"/>',d,re.S)[0]

# The final URL has the form <login_url>?service=<service>; split it into the
# two parts.  Index [1] raises IndexError when '?service=' is absent.
data['service'] = base_res.url.split('?service=')[1]
# data['service'] ='https://www.telegeography.com/products/global-internet-geography/provider-rankings/provider-profiles/index.html'
login_url = base_res.url.split('?service=')[0]
# login_url = 'https://www.telegeography.com/login/login'

# Because following the redirect automatically adds the domain, redirects are
# disabled (allow_redirects=False) and can be constructed by hand afterwards.
# The redirect target is in the Location header of this non-redirected reply.
# The POST goes to the login page's form action; the session takes care of
# the cookies.  (allow_redirects=False is unnecessary if the automatic
# redirect already lands on the right page.)
login_res = s.post(login_url, data=data, proxies=proxies, verify=False,
                   allow_redirects=False)
# Single-argument parenthesised print behaves the same on Python 2.
# Raises KeyError when the reply carries no Location header.
print(login_res.headers['Location'])

# The login also works without the following code:
# redir_url = urllib.unquote(login_res.headers['Location'])  # unquote only decodes the URL-encoded string back to plain text
# redir_res = s.get(redir_url, proxies=proxies, verify=False, allow_redirects=False)
# print redir_res

# From here on the logged-in session `s` can GET any page you need.
# The result is an ordinary GET response; use .content to get the page source.
# from lxml import etree
# req=s.get(base_url,proxies=proxies, verify=False).content
# html=etree.HTML(req)
# urls=html.xpath('//*[@id="content"]/table/tbody/tr/td[1]')  # NOTE(review): closing quote was missing in the source; expression may be truncated
# print urls
# for i in urls:
#     url='https://www.telegeography.com/products/global-internet-geography/provider-rankings/provider-profiles/'+i
#     html=etree.HTML(s.get(url,proxies=proxies, verify=False).content)
#     arrays=html.xpath('//*[@id="content"]/div[2]/div[@class="broadband-customers landscape wide"]/table/tbody/tr')
#     print arrays
# print 'True International Gateway (TIG)' in base_html
# print base_res.url
# print base_res.cookies
# print base_res.headers
# print 'XO Communications' in base_html
# print '1299, 3301, 5518, 1759, 3308' in base_html
# print 'Bosnia-Herzegovina' in base_html
# Source: (attribution missing from the original paste)