This article has two parts: an overview of web crawlers, to give you a detailed picture of what they are, and the Python implementation of HTTP requests, covering the various ways to issue HTTP requests in Python so that you can write HTTP networking programs.
01 Overview of Web Crawlers
The following introduces web crawlers from three angles: their concept, their uses and value, and their structure, to give you a basic understanding of what they are.
1. Web Crawlers and Their Applications
With the rapid growth of the Internet, the World Wide Web has become the carrier of a huge amount of information, and extracting and using that information effectively has become an enormous challenge; web crawlers arose to meet it. A web crawler (also called a web spider or web robot) is a program or script that automatically fetches information from the World Wide Web according to certain rules. Figure 3-1 shows the role a web crawler plays on the Internet:
(Figure 3-1: the role of web crawlers on the Internet)
By system architecture and implementation technique, web crawlers fall roughly into four types: general-purpose crawlers, focused crawlers, incremental crawlers, and deep-web crawlers. A real crawler system usually combines several of these techniques.
Search engines, such as the traditional general-purpose engines Baidu, Yahoo, and Google, are large and complex web crawlers and belong to the general-purpose category. But general-purpose search engines have several limitations:
- Users in different fields and with different backgrounds have different retrieval goals and needs, and the results a general-purpose engine returns contain many pages the user does not care about.
- A general-purpose engine aims for the widest possible coverage of the web, which deepens the conflict between its finite server resources and the web's effectively unlimited data.
- As data formats on the web grow richer and network technology advances, images, databases, audio, video, and other multimedia appear in large volumes; general-purpose engines struggle to discover and retrieve this information-dense, structured data.
- General-purpose engines mostly offer keyword-based retrieval and cannot easily support queries based on semantic information.
Focused crawlers, which fetch only the web resources relevant to a target topic, arose to address these problems.
02 The Python Implementation of HTTP Requests

1. Implementation with urllib2/urllib

The most basic GET request in Python 2 is a single call to urllib2.urlopen:

```python
import urllib2
response = urllib2.urlopen('http://www.zhihu.com')
html = response.read()
print html
```
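The snippet above is Python 2. In Python 3, urllib2 was merged into urllib.request; a minimal sketch of the same GET follows. The fetch helper name is mine, and the call is left commented out so nothing here touches the network:

```python
import urllib.request

def fetch(url):
    # urlopen returns an http.client.HTTPResponse; read() yields bytes,
    # so decode before treating the body as text
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

# html = fetch('http://www.zhihu.com')  # uncomment to actually fetch
```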
The request can also be split into an explicit Request object and the response:

```python
import urllib2
# the request
request = urllib2.Request('http://www.zhihu.com')
# the response
response = urllib2.urlopen(request)
html = response.read()
print html
```
To send a POST, pass form data as the second argument of Request; the data must first be URL-encoded:

```python
import urllib
import urllib2
url = 'http://www.xxxxxx.com/login'
postdata = {'username' : 'qiye',
            'password' : 'qiye_pass'}
# the form data must be encoded into a format urllib2 understands; urllib does that
data = urllib.urlencode(postdata)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
html = response.read()
```
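For comparison, in Python 3 the encoding step lives in urllib.parse and the body must be bytes. Constructing the Request already shows the automatic switch to POST, so this sketch runs without sending anything:

```python
import urllib.parse
import urllib.request

postdata = {'username': 'qiye', 'password': 'qiye_pass'}
# urlencode builds the application/x-www-form-urlencoded string;
# urllib.request wants the body as bytes, hence encode()
data = urllib.parse.urlencode(postdata).encode('utf-8')
req = urllib.request.Request('http://www.xxxxxx.com/login', data)
print(data)              # b'username=qiye&password=qiye_pass'
print(req.get_method())  # a Request with a body defaults to POST
```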
Headers such as User-Agent and Referer can be supplied when constructing the Request:

```python
import urllib
import urllib2
url = 'http://www.xxxxxx.com/login'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
referer = 'http://www.xxxxxx.com/'
postdata = {'username' : 'qiye',
            'password' : 'qiye_pass'}
# write user_agent and referer into the header fields
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(postdata)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
html = response.read()
```
The same headers and body can also be attached after construction, with add_header and add_data:

```python
import urllib
import urllib2
url = 'http://www.xxxxxx.com/login'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
referer = 'http://www.xxxxxx.com/'
postdata = {'username' : 'qiye',
            'password' : 'qiye_pass'}
data = urllib.urlencode(postdata)
req = urllib2.Request(url)
# write user_agent and referer into the header fields
req.add_header('User-Agent', user_agent)
req.add_header('Referer', referer)
req.add_data(data)
response = urllib2.urlopen(req)
html = response.read()
```
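Request objects can be inspected without any network traffic, which makes the header mechanics easy to verify. A Python 3 sketch of the same pattern (note that urllib stores header names in capitalized form, e.g. 'User-agent', so they must be queried that way):

```python
import urllib.request

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
req = urllib.request.Request('http://www.xxxxxx.com/login',
                             headers={'User-Agent': user_agent})
req.add_header('Referer', 'http://www.xxxxxx.com/')
# header keys are normalized via str.capitalize(), hence 'User-agent'
print(req.get_header('User-agent'))
print(req.get_header('Referer'))
```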
Cookies are handled with cookielib and an opener that carries a CookieJar:

```python
import urllib2
import cookielib
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print item.name + ':' + item.value
```
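In Python 3 cookielib became http.cookiejar, and the opener construction is otherwise identical. To show the name/value iteration offline, the sketch below parses a cookie string with the stdlib's http.cookies instead of hitting zhihu.com; the sample cookie value is made up:

```python
import http.cookiejar          # Python 3 name for cookielib
import urllib.request
from http.cookies import SimpleCookie

# same opener as the urllib2 version, under the Python 3 names
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar))

# offline demonstration of cookie parsing (hypothetical cookie value)
cookie = SimpleCookie('q_c1=abc123; Path=/')
for name, morsel in cookie.items():
    print(name + ':' + morsel.value)
```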
A cookie can also be set by hand, as a raw header on the opener:

```python
import urllib2
opener = urllib2.build_opener()
opener.addheaders.append(('Cookie', 'email=' + "xxxxxxx@163.com"))
req = urllib2.Request("http://www.zhihu.com/")
response = opener.open(req)
print response.headers
retdata = response.read()
```
A global timeout can be set on the socket module:

```python
import urllib2
import socket
socket.setdefaulttimeout(10)  # time out after 10 seconds
urllib2.socket.setdefaulttimeout(10)  # another way to write the same thing
```
Or per request, via the timeout parameter of urlopen:

```python
import urllib2
request = urllib2.Request('http://www.zhihu.com')
response = urllib2.urlopen(request, timeout=2)
html = response.read()
print html
```
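The same two knobs exist in Python 3 (socket.setdefaulttimeout and urlopen's timeout=). A sketch that only touches the global setting, without issuing any request:

```python
import socket

socket.setdefaulttimeout(10)    # every new socket times out after 10 seconds
current = socket.getdefaulttimeout()
print(current)
socket.setdefaulttimeout(None)  # restore the library default
```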
HTTP errors surface as urllib2.HTTPError exceptions, whose code attribute carries the status:

```python
import urllib2
try:
    response = urllib2.urlopen('http://www.google.com')
    print response
except urllib2.HTTPError as e:
    if hasattr(e, 'code'):
        print 'Error code:', e.code
```
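Python 3 moved the exception to urllib.error. Since google.com may not be reachable at all, the sketch below constructs an HTTPError by hand just to show the handling pattern; a real one is raised by urlopen on any 4xx/5xx response:

```python
import urllib.error

try:
    # hand-built error standing in for what urlopen would raise
    raise urllib.error.HTTPError('http://www.google.com', 404,
                                 'Not Found', None, None)
except urllib.error.HTTPError as e:
    code = e.code
    print('Error code:', code)
```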
To check whether a request was redirected, compare the final URL against the one you asked for:

```python
import urllib2
response = urllib2.urlopen('http://www.zhihu.cn')
# geturl() returns the final URL; if it differs, a redirect happened
isRedirected = response.geturl() != 'http://www.zhihu.cn'
```
For finer control, subclass HTTPRedirectHandler; this handler ignores 301s and tags followed 302s with their status code:

```python
import urllib2
class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass
    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code,
                                                            msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result
opener = urllib2.build_opener(RedirectHandler)
opener.open('http://www.zhihu.cn')
```
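The same subclassing works in Python 3 under urllib.request, and building the opener performs no I/O, so the wiring can be checked offline. A sketch (calling the parent's 302 handler rather than the 301 one):

```python
import urllib.request

class RedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass  # swallow permanent redirects
    def http_error_302(self, req, fp, code, msg, headers):
        # let the base class follow the redirect, then tag the result
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

opener = urllib.request.build_opener(RedirectHandler)
print(type(opener).__name__)
```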
To go through a proxy, install a ProxyHandler globally. Note that build_opener takes handlers as separate arguments, not a list:

```python
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.zhihu.com/')
print response.read()
```
install_opener changes the global default; to keep the proxy local to one request, use the opener directly:

```python
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
response = opener.open("http://www.zhihu.com/")
print response.read()
```
2. Implementation with httplib/urllib

httplib works at a lower level than urllib2. A GET request:

```python
import httplib
conn = None
try:
    conn = httplib.HTTPConnection("www.zhihu.com")
    conn.request("GET", "/")
    response = conn.getresponse()
    print response.status, response.reason
    print '-' * 40
    headers = response.getheaders()
    for h in headers:
        print h
    print '-' * 40
    print response.msg
except Exception, e:
    print e
finally:
    if conn:
        conn.close()
```
And a POST request, with the body and headers passed explicitly:

```python
import httplib, urllib
conn = None
try:
    params = urllib.urlencode({'name': 'qiye', 'age': 22})
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/plain"}
    conn = httplib.HTTPConnection("www.zhihu.com", 80, timeout=3)
    conn.request("POST", "/login", params, headers)
    response = conn.getresponse()
    print response.getheaders()  # get the response headers
    print response.status
    print response.read()
except Exception, e:
    print e
finally:
    if conn:
        conn.close()
```
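httplib is http.client in Python 3. HTTPConnection does not open a socket until request() is called, so everything up to that point can run offline; this sketch leaves the actual send commented out:

```python
import http.client
import urllib.parse

params = urllib.parse.urlencode({'name': 'qiye', 'age': 22})
headers = {'Content-type': 'application/x-www-form-urlencoded',
           'Accept': 'text/plain'}
# constructing the connection performs no network I/O yet
conn = http.client.HTTPConnection('www.zhihu.com', 80, timeout=3)
print(params)
# conn.request('POST', '/login', params, headers)  # uncomment to send
conn.close()
```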
3. The More Human-Friendly Requests

With the third-party Requests library, a GET request is one line:

```python
import requests
r = requests.get('http://www.baidu.com')
print r.content
```
POST and the other HTTP verbs look the same:

```python
import requests
postdata = {'key': 'value'}
r = requests.post('http://www.xxxxxx.com/login', data=postdata)
print r.content
r = requests.put('http://www.xxxxxx.com/put', data={'key': 'value'})
r = requests.delete('http://www.xxxxxx.com/delete')
r = requests.head('http://www.xxxxxx.com/get')
r = requests.options('http://www.xxxxxx.com/get')
```
Query-string parameters are passed as a dict via params, and Requests builds the URL for you:

```python
import requests
payload = {'Keywords': 'blog:qiyeboy', 'pageindex': 1}
r = requests.get('http://zzk.cnblogs.com/s/blogpost', params=payload)
print r.url
```
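What Requests does with params= is essentially stdlib URL encoding. The sketch below builds the same URL by hand with urllib.parse, so the result can be checked without a request; note the ':' in 'blog:qiyeboy' becomes %3A:

```python
from urllib.parse import urlencode

payload = {'Keywords': 'blog:qiyeboy', 'pageindex': 1}
# equivalent of what requests appends for params=
url = 'http://zzk.cnblogs.com/s/blogpost?' + urlencode(payload)
print(url)
```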
r.content is the raw bytes, while r.text is decoded using r.encoding, which you can override:

```python
import requests
r = requests.get('http://www.baidu.com')
print 'content-->' + r.content
print 'text-->' + r.text
print 'encoding-->' + r.encoding
r.encoding = 'utf-8'
print 'new text-->' + r.text
```
When the declared encoding is wrong, the chardet library can guess it from the raw bytes:

```python
import requests
import chardet
r = requests.get('http://www.baidu.com')
print chardet.detect(r.content)
r.encoding = chardet.detect(r.content)['encoding']
print r.text
```
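What r.text does under the hood is simply decode r.content with r.encoding, which is why a wrong encoding produces mojibake. This sketch fakes r.content with a GBK-encoded sample (old Baidu pages were GBK) and decodes it both ways; no chardet or network needed:

```python
raw = '百度'.encode('gbk')     # stands in for r.content
wrong = raw.decode('latin-1')  # what a mis-detected encoding yields
right = raw.decode('gbk')      # what the correct encoding yields
print(repr(wrong))
print(right)
```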
With stream=True the body is not downloaded at once; r.raw reads it directly from the socket:

```python
import requests
r = requests.get('http://www.baidu.com', stream=True)
print r.raw.read(10)
```
Custom headers are passed as a dict:

```python
import requests
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
print r.content
```
Status codes and response headers:

```python
import requests
r = requests.get('http://www.baidu.com')
if r.status_code == requests.codes.ok:
    print r.status_code  # status code
    print r.headers  # response headers
    print r.headers.get('content-type')  # recommended way to read one field
    print r.headers['content-type']  # not recommended: raises KeyError if absent
else:
    r.raise_for_status()
```
Cookies sent back by the server are available on r.cookies:

```python
import requests
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
# iterate over all cookie fields and their values
for cookie in r.cookies.keys():
    print cookie + ':' + r.cookies.get(cookie)
```
Cookies can also be sent with the request, as a dict:

```python
import requests
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
cookies = dict(name='qiye', age='10')
r = requests.get('http://www.baidu.com', headers=headers, cookies=cookies)
print r.text
```
A Session object keeps cookies across requests, which is how logins are usually scripted:

```python
import requests
loginUrl = 'http://www.xxxxxxx.com/login'
s = requests.Session()
# visit the login page first as a guest, so the server assigns a cookie
r = s.get(loginUrl, allow_redirects=True)
datas = {'name': 'qiye', 'passwd': 'qiye'}
# POST to the login URL; once verified, guest rights become member rights
r = s.post(loginUrl, data=datas, allow_redirects=True)
print r.text
```
Requests follows redirects automatically and records them in r.history:

```python
import requests
r = requests.get('http://github.com')
print r.url
print r.status_code
print r.history
```

The output:

```
https://github.com/
200
(<Response [301]>,)
```
Proxies are a dict mapping scheme to proxy URL; HTTP basic auth can be embedded in the URL:

```python
import requests
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
requests.get("http://example.org", proxies=proxies)
proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",
}
```
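urllib.request offers the same facility through ProxyHandler; the handler and opener can be built without sending anything, so this sketch is safe to run offline:

```python
import urllib.request

# same proxy mapping shape as the requests example above
proxy = urllib.request.ProxyHandler({'http': 'http://10.10.1.10:3128'})
opener = urllib.request.build_opener(proxy)
# urllib.request.install_opener(opener) would make it the global default,
# exactly as with urllib2
print(type(opener).__name__)
```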
Source: http://www.jianshu.com/p/1c011a1c0bff