Python 爬虫 (5) 借助搜狗搜索爬取微信文章

借助搜狗搜索爬取微信文章
from urllib import request as r
import re as e
from urllib import error as o
import time as t
# Pretend to be a desktop browser.
# NOTE: this must be a (name, value) tuple. A set literal ({"a", "b"}) has no
# defined order, so the header name and value could be swapped when urllib
# unpacks the entries of ``addheaders``.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) ApplewebKit/537.36 (Khtml, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2767.400",
)
opener = r.build_opener()
opener.addheaders = [headers]
# Install the opener globally so that plain r.urlopen() calls go through it.
r.install_opener(opener)
# Module-level list that accumulates the article links found on each page.
listurl = []


# Helper: download a URL through a proxy server.
def use_proxy(proxy_addr, url, timeout=30):
    """Fetch ``url`` through the HTTP proxy ``proxy_addr`` and return its text.

    A new opener using the proxy is built and installed globally on every
    call; it carries the same browser ``User-Agent`` header as the default
    opener so the proxied requests are still sent with the browser identity.

    Args:
        proxy_addr: Proxy address as ``"host:port"``; applied to ``http``
            URLs only.
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up, so a
            dead proxy cannot block the crawl forever.

    Returns:
        The response body decoded as UTF-8, or ``None`` if the request (or
        the decoding) failed. Failures are printed, not raised.
    """
    try:
        proxy = r.ProxyHandler({'http': proxy_addr})
        proxy_opener = r.build_opener(proxy, r.HTTPHandler)
        # A freshly built opener only has urllib's default headers; re-apply
        # the browser User-Agent before it replaces the global opener.
        proxy_opener.addheaders = [headers]
        r.install_opener(proxy_opener)
        # The context manager closes the connection even if read/decode fails.
        with r.urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except o.URLError as u:
        if hasattr(u, 'code'):
            print(u.code)
        if hasattr(u, 'reason'):
            print(u.reason)
        # Back off for 10 seconds after a URLError.
        t.sleep(10)
    except Exception as x:
        print('Exception:' + str(x))
        # Back off for 1 second after any other error.
        t.sleep(1)
    return None
# Collect the article links from every requested search-result page.
def getlisturl(key, pagestart, pageend, proxy):
    """Query Sogou's WeChat search and collect the links found on each page.

    Args:
        key: Search keyword; it is percent-encoded before being placed in
            the query string.
        pagestart: Number of the first result page to fetch (inclusive).
        pageend: Number of the last result page to fetch (inclusive).
        proxy: Proxy address passed to ``use_proxy`` for every request.

    Returns:
        The module-level ``listurl`` list. One list of matched URLs is
        appended to it per requested page, so it is two-dimensional:
        ``listurl[page][link]``. A page whose download failed contributes an
        empty list, which keeps the first index aligned with the page order.
        If an unexpected error interrupts the loop, whatever was collected so
        far is returned.
    """
    from urllib.parse import quote

    # Any href that points at an http:// address counts as an article link.
    linkpat = e.compile('href="(http://.*?)"', e.S)
    try:
        # Percent-encode the keyword so it is safe inside the query string.
        keycode = quote(key)
        for page in range(pagestart, pageend + 1):
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + "&page=" + str(page))
            # Go through the proxy to avoid getting the local IP banned.
            data1 = use_proxy(proxy, url)
            # use_proxy() yields None when the download failed; treat that
            # page as having no links instead of aborting the whole crawl.
            links = [] if data1 is None else linkpat.findall(data1)
            listurl.append(links)
        # Progress output.
        print("共获得到" + str(len(listurl)) + "页")
    except o.URLError as u:
        if hasattr(u, 'code'):
            print(u.code)
        if hasattr(u, 'reason'):
            print(u.reason)
        # Back off for 10 seconds after a URLError.
        t.sleep(10)
    except Exception as x:
        print('Exception:' + str(x))
        # Back off for 1 second after any other error.
        t.sleep(1)
    return listurl
# Download every collected article and save its title and body locally.
def getcontent(listurl, proxy, outfile="G:\\Pcode\\1.html"):
    """Fetch each article in ``listurl`` and write them into one HTML file.

    Args:
        listurl: Two-dimensional list ``listurl[page][link]`` of article
            URLs. ``None`` is treated like an empty list.
        proxy: Proxy address passed to ``use_proxy`` for every request.
        outfile: Path of the HTML file to (over)write.

    The file always receives the HTML header and footer. For every article
    that could be downloaded, one ``<p>`` pair with its title and content is
    written in between; a placeholder text is used for a part whose pattern
    did not match. Articles whose download failed are skipped. Errors while
    handling a single article are printed and do not stop the loop.
    """
    # Opening part of the generated HTML document.
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title > 微信文章页面 </title>
</head>
<body>'''
    # Closing part of the generated HTML document.
    html2 = '''</body>
</html>'''
    # Compile once; both patterns are reused for every article.
    titlepat = e.compile("<title>(.*?)</title>")
    contentpat = e.compile('id="js_content">(.*?)id="js_sg_bar"', e.S)

    # A single ``with`` block guarantees the file is closed on any error.
    with open(outfile, "wb") as fh:
        fh.write(html1.encode("utf-8"))
        # First index: result page; second index: link within that page.
        for i, page_links in enumerate(listurl or []):
            for j, link in enumerate(page_links):
                try:
                    # The scraped link carries HTML-escaped ampersands
                    # ("&amp;"); dropping "amp;" restores the real URL.
                    url = link.replace("amp;", "")
                    data = use_proxy(proxy, url)
                    if data is None:
                        # Download failed; use_proxy() already reported it.
                        continue
                    title = titlepat.findall(data)
                    content = contentpat.findall(data)
                    # Fall back to a placeholder when nothing matched.
                    thistitle = title[0] if title else "此次没有获取到"
                    thiscontent = content[0] if content else "此次没有获取到"
                    dataall = ("<p > 标题为:" + thistitle
                               + "</p><p > 内容为:" + thiscontent + "</p><br/>")
                    fh.write(dataall.encode('utf-8'))
                    # Progress output, handy while debugging.
                    print("第" + str(i + 1) + "个网页第" + str(j + 1) + "次处理")
                except o.URLError as u:
                    if hasattr(u, 'code'):
                        print(u.code)
                    if hasattr(u, 'reason'):
                        print(u.reason)
                    # Back off for 10 seconds after a URLError.
                    t.sleep(10)
                except Exception as x:
                    print('Exception:' + str(x))
                    # Back off for 1 second after any other error.
                    t.sleep(1)
        fh.write(html2.encode("utf-8"))
# Search keyword.
key = "物联网"
# Proxy server address; this particular proxy may no longer be alive.
proxy = '119.101.113.217:9999'
# A second proxy could be used so that getlisturl() and getcontent() go
# through different servers; it is not wired in here.
proxy2 = ''
# First result page to crawl.
pagestart = 1
# Last result page to crawl (inclusive).
pageend = 2
# Fall back to an empty list if getlisturl() hands back None instead of a
# list, so getcontent() always receives something iterable.
listurl = getlisturl(key, pagestart, pageend, proxy) or []
getcontent(listurl, proxy)
来源: http://www.bubuko.com/infodetail-3016105.html
与本文相关文章

暂无,快来抢沙发吧！