urllib.request
import urllib.request

url = 'http://www.baidu.com/'
response = urllib.request.urlopen(url).read()
# urllib.request.urlopen opens a URL; read() returns the page content as bytes
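Since read() returns bytes, the content usually has to be decoded before use. A minimal sketch, assuming the page is UTF-8 (check the page's declared charset in practice):

import urllib.request

url = 'http://www.baidu.com/'
raw = urllib.request.urlopen(url).read()  # bytes
html = raw.decode('utf-8')                # assumed charset
print(html[:200])                         # first 200 characters of the page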
import urllib.request

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1543822283&di=b327e6e2dc59105bcb73a174bff94919&imgtype=jpg&er=1&src=http://tupian.qqjay.com/u/2017/1201/2_161641_2.jpg'
res = urllib.request.urlopen(url)

# Download method 1: read the response and write the bytes yourself
with open('fengjing.jpg', 'wb') as f:
    f.write(res.read())

# Download method 2
urllib.request.urlretrieve(url, 'tupian.jpg')
# urllib.request.urlretrieve downloads the file directly to the given path
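urlretrieve also accepts a reporthook callback that fires as blocks arrive, which is handy for progress output. A minimal sketch continuing the snippet above (show_progress is an illustrative name, not a library function):

def show_progress(block_num, block_size, total_size):
    # block_num: blocks transferred so far; total_size: file size in bytes
    if total_size > 0:
        percent = min(block_num * block_size / total_size * 100, 100)
        print(f'{percent:.1f}% downloaded')

urllib.request.urlretrieve(url, 'tupian.jpg', show_progress)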
import urllib.request

url = 'http://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
# Build the request object
request = urllib.request.Request(url=url, headers=headers)
# urllib.request.Request builds a request object that carries custom request headers
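The Request object does not send anything by itself; it still has to be passed to urlopen. A minimal continuation of the snippet above:

# Pass the Request object to urlopen; the custom headers go with it
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))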
import urllib.request

# Route HTTP requests through a proxy server
handler = urllib.request.ProxyHandler({'http': '124.243.226.18:8888'})
opener = urllib.request.build_opener(handler)

url = 'http://www.baidu.com/s?wd=IP'
headers = {
    "Host": "www.baidu.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.8",
}
request = urllib.request.Request(url=url, headers=headers)
response = opener.open(request)
print(response.read().decode())
# Set a proxy for the program
urllib.parse
import urllib.parse

url = 'http://www.baidu.com/s?wd=知乎'
res = urllib.parse.urlparse(url)
print(res)
# Output:
# ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='', query='wd=知乎', fragment='')
# urllib.parse.urlparse splits a URL into its components
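The inverse operation exists too: urllib.parse.urlunparse reassembles the six components into a URL. A minimal sketch:

import urllib.parse

parts = ('http', 'www.baidu.com', '/s', '', 'wd=知乎', '')
print(urllib.parse.urlunparse(parts))
# http://www.baidu.com/s?wd=知乎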
import urllib.parse

word = '知乎'
res = urllib.parse.quote(word)
print(res)
res = urllib.parse.unquote(res)
print(res)
# Output:
# %E7%9F%A5%E4%B9%8E
# 知乎
# urllib.parse.quote and urllib.parse.unquote percent-encode and decode characters
import urllib.parse

params = {
    'wd': '知乎'
}
word = urllib.parse.urlencode(params)
print(word)
# Output:
# wd=%E7%9F%A5%E4%B9%8E
# urllib.parse.urlencode encodes a dict into a query string of this form
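This pairs naturally with the request snippets above: encode the parameters, append them to the base URL, and fetch it. A minimal sketch (the search may be rejected without browser headers, as shown earlier):

import urllib.parse
import urllib.request

params = {'wd': '知乎'}
url = 'http://www.baidu.com/s?' + urllib.parse.urlencode(params)
response = urllib.request.urlopen(url)
print(response.status)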
requests
import requests

response = requests.get('http://www.baidu.com/')
print(response.url)
print(response.text)
print(response.status_code)
print(response.headers)
print(response.cookies)
print(response.content.decode())
Basic methods
http://www.baidu.com/
<!DOCTYPE html>
<!--STATUS OK--><html><head>...<title>ç?¾åº¦ä¸?ä¸?ï¼?ä½ å°±ç?¥é??</title>...</html>
(response.text, abridged: mojibake, because requests guessed the wrong character encoding)
200
{'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'Keep-Alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Mon, 03 Dec 2018 13:39:39 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:32 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
<!DOCTYPE html>
<!--STATUS OK--><html><head>...<title>百度一下，你就知道</title>...</html>
(response.content.decode(), abridged: the raw bytes decode correctly as UTF-8)
Run result
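The mojibake in response.text appears because the Content-Type header above carries no charset, so requests falls back to ISO-8859-1; response.content.decode() sidesteps this by decoding the raw bytes as UTF-8. A minimal sketch of the usual fix:

import requests

response = requests.get('http://www.baidu.com/')
# apparent_encoding is detected from the body itself;
# setting it makes .text decode correctly
response.encoding = response.apparent_encoding
print(response.text)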
'''
Target site:
http://www.shicimingju.com/book/sanguoyanyi.html
Approach:
1. Determine the url
2. Fake the browser headers
3. Build the request
4. Fetch the content of the page the url points to
5. Parse the fetched page for chapter titles and article links
6. Download the chapter contents to a local file
'''
import requests
from bs4 import BeautifulSoup

def handle_requests(url):
    # Fake the browser headers so the site does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    return response

def download_text(title, href):
    # Fetch one chapter page and append its text to the local file
    response = handle_requests(href)
    content = response.text
    soup = BeautifulSoup(content, 'lxml')
    res = soup.find('div', class_='chapter_content').find_all('p')
    f = open('hongloumeng.txt', 'a', encoding='utf-8')
    f.write(title)
    print(title)
    for p in res:
        f.write(p.text)
    f.close()

def parse_content(content):
    # Parse the table of contents for chapter titles and links
    soup = BeautifulSoup(content, 'lxml')
    res = soup.select('.book-mulu > ul > li > a')
    for i in res:
        title = i.text
        href = 'http://www.shicimingju.com' + i['href']
        print('Downloading...')
        download_text(title, href)

def main():
    url = 'http://www.shicimingju.com/book/hongloumeng.html'
    # Build the headers and send the request
    response = handle_requests(url)
    # Get the full page content from the response
    content = response.text
    # Parse the page for chapter titles and content links
    parse_content(content)

if __name__ == '__main__':
    main()
Scraping a novel with requests
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
# Form data for the POST request
data = {
    'cname': '',
    'pid': '',
    'keyword': '杭州',
    'pageIndex': '1',
    'pageSize': '10',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
}
response = requests.post(url, data=data, headers=headers)
print(response.text)
A POST request with requests
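The endpoint returns JSON, so parsing it is usually more useful than printing the raw text. A minimal sketch continuing the snippet above (the 'Table1' key and field names are assumptions about the payload, not documented API; inspect result.keys() first):

result = response.json()
# 'Table1' and the field names are assumed, not guaranteed by the API
for store in result.get('Table1', []):
    print(store.get('storeName'), store.get('addressDetail'))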
selenium
selenium started out as a testing tool; crawlers use it mainly because requests cannot execute the JavaScript on a page.
selenium works by driving a real browser and fully simulating browser operations such as navigation, typing, clicking, and scrolling, so it can return the page as it looks after rendering; it supports multiple browsers. A minimal sketch of that idea follows, before the full login example.
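A minimal sketch of fetching rendered HTML (the chromedriver path is an assumption; point it at your own install):

from selenium import webdriver

# chromedriver path is an assumption for this sketch
browser = webdriver.Chrome(executable_path=r'D:\chromedriver.exe')
browser.get('http://www.baidu.com/')
# page_source is the DOM after JavaScript has run,
# which plain requests cannot provide
print(browser.page_source[:200])
browser.quit()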
from selenium import webdriver
import time

path = r'D:\Python1806\spider\day5\chromedriver.exe'
url = 'https://so.gushiwen.org/user/login.aspx'
browser = webdriver.Chrome(executable_path=path)

# Open the login page
browser.get(url)
time.sleep(2)

# Locate the username and password inputs and fill them in
username = browser.find_element_by_xpath('//input[@id="email"]')
username.send_keys('wusir666666@163.com')
time.sleep(1)
pwd = browser.find_element_by_xpath("//input[@id='pwd']")
pwd.send_keys('ymmnxhwm13579')
time.sleep(1)

# Screenshot the page so the captcha can be read by hand
browser.save_screenshot('login.png')
code = input('Enter the captcha: ')
checkcode = browser.find_element_by_xpath("//input[@id='code']")
checkcode.send_keys(code)

# Click the login button
login = browser.find_element_by_xpath("//input[@id='denglu']")
login.click()
Basic selenium usage
Source: http://www.bubuko.com/infodetail-2871589.html