如何用 python 写一个爬虫, 下载某论坛的动漫图片
实现过程
获取网页内容
import urllib.request
import urllib.error
import re
import json  # was `import JSON`: no such module — import failed at runtime


def getHttpStatusCode(tempUrl):
    """Probe tempUrl and report reachability.

    Returns 0 if the URL opens successfully, 1 on an HTTP error
    (4xx/5xx), 2 on any other URL error (bad scheme, DNS failure, ...).
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # `with` closes the connection; the original leaked it on success.
        with opener.open(tempUrl):
            return 0
    # HTTPError is a subclass of URLError, so it must be caught first.
    except urllib.error.HTTPError:
        print(tempUrl + "= 访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "= 访问页面出错")
        return 2
def getcontent(url):
    """Fetch url with a browser-like User-Agent and return the body decoded as UTF-8."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # `with` guarantees the socket is closed even if read/decode raises;
    # the original never closed the response.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
分析内容中的信息, 提取需要的链接
def getcatalogarray(content):
    """Extract the catalog JSON text embedded in a 4chan catalog page.

    The page contains ``var catalog = {...};var style_group``; the JSON
    object between ``=`` and ``;`` is returned as a string.
    Raises AttributeError (on ``None.group``) if the marker is absent,
    matching the original behavior.
    """
    # Original pattern used an unescaped `.` where the literal `;` was
    # meant; also tolerate flexible whitespace around `=`.
    pattern = r"var\s+catalog\s*=\s*(.*?);\s*var\s+style_group"
    res = re.search(pattern, content, re.M | re.S)
    return res.group(1)
获取网页中的图片下载链接
def getimageurls(content):
    """Return every image URL (//i.4cdn.org/...jpg|png|gif) linked from the page HTML."""
    # Fixes vs. original: the docstring ended in `""""` (a syntax error);
    # the classes [j|p|g][p|n|i][g|f] included `|` literally and matched
    # junk extensions like "jif"; the domain dots were unescaped.
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|png|gif))\""
    res = re.findall(pattern, content)
    return res
下载图片
# Download one image: fetch image_download_url and save it at img_save_path.
# (Both names are placeholders; the full script below builds them per image.)
urllib.request.urlretrieve(
    image_download_url, img_save_path
)
完整代码
tools.py
# -*- coding=utf-8 -*-
# author vvyun
import urllib.request
import urllib.error
import re
import json  # was `import JSON`: no such module, so tools.py could not even be imported

"""
HTTP helper utilities for the 4chan image crawler.
"""


def getcontent(url):
    """Fetch url with a browser-like User-Agent and return the body decoded as UTF-8."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # `with` guarantees the socket is closed even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def savestr2file(filename, content):
    """Write content to filename as UTF-8 text, creating or truncating the file."""
    # `with` closes the handle even if write() raises; the original leaked
    # it in that case. "w" suffices — the "+" (read) mode was never used.
    with open(filename, "w", encoding="utf8") as output:
        output.write(content)
def getcatalogarray(content):
    """Extract the catalog JSON text embedded in a 4chan catalog page.

    The page contains ``var catalog = {...};var style_group``; the JSON
    object between ``=`` and ``;`` is returned as a string.
    Raises AttributeError (on ``None.group``) if the marker is absent,
    matching the original behavior.
    """
    # Original pattern used an unescaped `.` where the literal `;` was
    # meant; also tolerate flexible whitespace around `=`.
    pattern = r"var\s+catalog\s*=\s*(.*?);\s*var\s+style_group"
    res = re.search(pattern, content, re.M | re.S)
    return res.group(1)
def getimageurls(content):
    """Return every image URL (//i.4cdn.org/...jpg|png|gif) linked from the page HTML."""
    # Fixes vs. original: the docstring ended in `""""` (a syntax error);
    # the classes [j|p|g][p|n|i][g|f] included `|` literally and matched
    # junk extensions like "jif"; the domain dots were unescaped.
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|png|gif))\""
    res = re.findall(pattern, content)
    return res
# Probe a URL before downloading it.
def getHttpStatusCode(tempUrl):
    """Probe tempUrl and report reachability.

    Returns 0 if the URL opens successfully, 1 on an HTTP error
    (4xx/5xx), 2 on any other URL error (bad scheme, DNS failure, ...).
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # `with` closes the connection; the original leaked it on success.
        with opener.open(tempUrl):
            return 0
    # HTTPError is a subclass of URLError, so it must be caught first.
    except urllib.error.HTTPError:
        print(tempUrl + "= 访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "= 访问页面出错")
        return 2
def mkdir(path):
    """Create directory `path` (including parents).

    Returns True if the directory was created, False if it already existed.
    """
    import os

    # Normalize: drop surrounding whitespace and any trailing backslashes.
    cleaned = path.strip().rstrip("\\")
    # Guard clause: nothing to do when the path already exists.
    if os.path.exists(cleaned):
        return False
    os.makedirs(cleaned)
    return True
main_a.py
# -*- coding=utf-8 -*-
# author vvyun
import json  # was `import JSON`: no such module — the script crashed at startup
import re
import os
import urllib.request  # explicit: the original leaned on `urllib` leaking through the star import

from tools import *

# Board name: 4chan /a/.
basec = "a"
urlindexbase = "https://boards.4chan.org/" + basec + "/catalog"
urlthreadbase = "https://boards.4chan.org/" + basec + "/thread/"


def main():
    """Crawl the board catalog and download every image of every thread."""
    # Fetch the catalog page HTML.
    content = getcontent(urlindexbase)
    # Pull the embedded `var catalog = {...}` JSON text out of the page.
    catalog_index = getcatalogarray(content)
    # NOTE(review): assumes the parsed object is a dict with a top-level
    # "threads" mapping keyed by thread id — confirm against the live page.
    catalog_threads = json.loads(catalog_index)["threads"]
    # Walk every thread and download its images.
    for thread_url in catalog_threads:
        print(urlthreadbase + thread_url)
        content_thread = getcontent(urlthreadbase + thread_url)
        imagedata = getimageurls(content_thread)
        # One subdirectory per thread.
        img_save_basepath = "image/data/" + thread_url + "/"
        mkdir(img_save_basepath)
        for image_url in imagedata:
            # Links are protocol-relative (//i.4cdn.org/...).
            imd = "https:" + image_url
            print(imd)
            # Only download URLs that probe as reachable (status code 0).
            # The original wrapped this in `try: ... except Exception as e:
            # raise e`, which is a no-op and was removed.
            if getHttpStatusCode(imd) < 1:
                urllib.request.urlretrieve(
                    imd, img_save_basepath + image_url.replace("/", "")
                )


if __name__ == "__main__":
    main()
- GitHub : https://github.com/vvyun/python-web-crawler/tree/master/a_4chan
来源: http://www.jianshu.com/p/6ff012617e2d