Life is short, I use Python.
Links to the previous posts in this series:
Python Crawler for Beginners (1): Introduction https://www.geekdigging.com/2019/11/13/3303836941/
Python Crawler for Beginners (2): Preparation (1), Installing the Basic Libraries https://www.geekdigging.com/2019/11/20/2586166930/
Python Crawler for Beginners (3): Preparation (2), Linux Basics https://www.geekdigging.com/2019/11/21/1005563697/
Python Crawler for Beginners (4): Preparation (3), Docker Basics https://www.geekdigging.com/2019/11/22/3679472340/
Python Crawler for Beginners (5): Preparation (4), Database Basics https://www.geekdigging.com/2019/11/24/334078215/
Python Crawler for Beginners (6): Preparation (5), Installing Crawler Frameworks https://www.geekdigging.com/2019/11/25/1881661601/
Python Crawler for Beginners (7): HTTP Basics https://www.geekdigging.com/2019/11/26/1197821400/
Python Crawler for Beginners (8): Web Page Basics https://www.geekdigging.com/2019/11/27/101847406/
Python Crawler for Beginners (9): Crawler Basics https://www.geekdigging.com/2019/11/28/1668465912/
Python Crawler for Beginners (10): Sessions and Cookies https://www.geekdigging.com/2019/12/01/2475257648/
Python Crawler for Beginners (11): Basic urllib Usage (1) https://www.geekdigging.com/2019/12/02/2333822325/
Python Crawler for Beginners (12): Basic urllib Usage (2) https://www.geekdigging.com/2019/12/03/819896244/
Python Crawler for Beginners (13): Basic urllib Usage (3) https://www.geekdigging.com/2019/12/04/2992515886/
Python Crawler for Beginners (14): Basic urllib Usage (4) https://www.geekdigging.com/2019/12/05/104488944/
Python Crawler for Beginners (15): Basic urllib Usage (5) https://www.geekdigging.com/2019/12/07/2788855167/
Python Crawler for Beginners (16): urllib in Practice, Scraping an Image Gallery https://www.geekdigging.com/2019/12/09/1691033431/
Python Crawler for Beginners (17): Basic Usage of Requests https://www.geekdigging.com/2019/12/10/1910005577/
Python Crawler for Beginners (18): Advanced Requests Operations https://www.geekdigging.com/2019/12/11/1468953802/
Python Crawler for Beginners (19): XPath Basics https://www.geekdigging.com/2019/12/12/3568648672/
Python Crawler for Beginners (20): Advanced XPath https://www.geekdigging.com/2019/12/13/2569867940/
Python Crawler for Beginners (21): The Beautiful Soup Parsing Library (Part 1) https://www.geekdigging.com/2019/12/15/2789385418/
Python Crawler for Beginners (22): The Beautiful Soup Parsing Library (Part 2) https://www.geekdigging.com/2019/12/16/876770087/
Python Crawler for Beginners (23): Getting Started with the pyquery Parsing Library https://www.geekdigging.com/2019/12/17/876770088/
Python Crawler for Beginners (24): The 2019 Douban Movie Rankings https://www.geekdigging.com/2019/12/18/1275791678/
Python Crawler for Beginners (25): Scraping Stock Information https://www.geekdigging.com/2019/12/19/1066903974/
Python Crawler for Beginners (26): Why You Can't Afford a Second-Hand Apartment in Shanghai https://www.geekdigging.com/2019/12/20/788803015/
Python Crawler for Beginners (27): The Selenium Test Automation Framework, from Beginner to Giving Up (Part 1) https://www.geekdigging.com/2019/12/22/151891020/
Python Crawler for Beginners (28): The Selenium Test Automation Framework, from Beginner to Giving Up (Part 2) https://www.geekdigging.com/2019/12/24/1100772905/
Python Crawler for Beginners (29): Fetching Product Information from a Large E-Commerce Site with Selenium https://www.geekdigging.com/2019/12/25/7469407721/
Python Crawler for Beginners (30): Proxy Basics https://www.geekdigging.com/2019/12/26/9565104888/
The database access layer, `MysqlClient.py`:

```python
# MysqlClient.py
import uuid

import pymysql

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'password'
MYSQL_DB = 'test'
MYSQL_CHARSET = 'utf8mb4'


class MysqlClient(object):
    def __init__(self, host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER,
                 password=MYSQL_PASSWORD, database=MYSQL_DB, charset=MYSQL_CHARSET):
        """
        Initialize the MySQL connection.
        :param host: MySQL host
        :param port: MySQL port
        :param user: MySQL user
        :param password: MySQL password
        :param database: MySQL schema
        :param charset: character set to use
        """
        self.conn = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=database,
            charset=charset
        )

    def add_proxy(self, proxy):
        """
        Insert a new proxy.
        :param proxy: proxy dict
        :return:
        """
        sql = ('INSERT INTO `proxy_pool` VALUES (%(id)s, %(scheme)s, %(ip)s, '
               '%(port)s, %(status)s, %(response_time)s, now(), null)')
        data = {
            "id": str(uuid.uuid1()),
            "scheme": proxy['scheme'],
            "ip": proxy['ip'],
            "port": proxy['port'],
            "status": proxy['status'],
            "response_time": proxy['response_time'],
        }
        self.conn.cursor().execute(sql, data)
        self.conn.commit()

    def find_all(self):
        """
        Fetch all available proxies.
        :return:
        """
        sql = 'SELECT * FROM proxy_pool WHERE status = "1" ORDER BY update_date ASC'
        cursor = self.conn.cursor()
        cursor.execute(sql)
        res = cursor.fetchall()
        cursor.close()
        self.conn.commit()
        return res

    def update_proxy(self, proxy):
        """
        Update a proxy record.
        :param proxy: the proxy to update
        :return:
        """
        sql = ('UPDATE proxy_pool SET scheme = %(scheme)s, ip = %(ip)s, '
               'port = %(port)s, status = %(status)s, '
               'response_time = %(response_time)s, update_date = now() '
               'WHERE id = %(id)s')
        data = {
            "id": proxy['id'],
            "scheme": proxy['scheme'],
            "ip": proxy['ip'],
            "port": proxy['port'],
            "status": proxy['status'],
            "response_time": proxy['response_time'],
        }
        self.conn.cursor().execute(sql, data)
        self.conn.commit()
```
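The code above assumes a `proxy_pool` table already exists; the post does not show its schema. From the `INSERT` and `UPDATE` statements one can infer a layout roughly like the following (the column types here are my guesses, not taken from the original):

```sql
CREATE TABLE `proxy_pool` (
    `id`            VARCHAR(36) NOT NULL,  -- uuid1() string
    `scheme`        VARCHAR(10),           -- http / https
    `ip`            VARCHAR(64),
    `port`          VARCHAR(10),
    `status`        VARCHAR(1),            -- '1' = reachable, '0' = not
    `response_time` INT,                   -- milliseconds
    `create_date`   DATETIME,              -- set to now() on insert
    `update_date`   DATETIME,              -- null on insert, now() on update
    PRIMARY KEY (`id`)
) DEFAULT CHARSET = utf8mb4;
```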
The crawler, `CrawlProxy.py`, pulls the free proxy list from www.ip3366.net, verifies each entry, and stores the ones that pass:

```python
# CrawlProxy.py
import requests
from pyquery import PyQuery

from MysqlClient import MysqlClient
from VerifyProxy import VerifyProxy


class CrawlProxy(object):
    def __init__(self):
        self.MySQL = MysqlClient()
        self.verify = VerifyProxy()

    def get_page(self, url, charset):
        response = requests.get(url)
        response.encoding = charset
        return response.text

    def crawl_ip3366(self, page_num=3):
        """
        Crawl proxies from ip3366.
        :param page_num: number of list pages to crawl
        :return:
        """
        start_url = 'http://www.ip3366.net/?stype=1&page={}'
        urls = [start_url.format(page) for page in range(1, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            html = self.get_page(url, 'gb2312')
            if html:
                d = PyQuery(html)
                trs = d('.table-bordered tbody tr').items()
                for tr in trs:
                    scheme = tr.find('td:nth-child(4)').text().lower()
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    verify_result = self.verify.verify_proxy(scheme, ip, port)
                    if verify_result["status"] == '1':
                        proxy = {
                            "scheme": scheme,
                            "ip": ip,
                            "port": port,
                            "status": verify_result["status"],
                            "response_time": verify_result["response_time"],
                        }
                        # Save to the database
                        self.MySQL.add_proxy(proxy)
                        print('Proxy', ip, 'passed the connectivity test, saved to MySQL')
                    else:
                        print('Proxy', ip, 'failed the connectivity test')


if __name__ == '__main__':
    CrawlProxy().crawl_ip3366()
```
The verifier, `VerifyProxy.py`, tests each proxy's connectivity against Baidu and records the response time:

```python
# VerifyProxy.py
import requests

from MysqlClient import MysqlClient


class VerifyProxy(object):
    def __init__(self):
        self.MySQL = MysqlClient()

    def verify_proxy(self, scheme, ip, port):
        """
        Test a proxy's connectivity against Baidu and return the
        response time (in ms).
        :param scheme: http or https
        :param ip: proxy IP
        :param port: proxy port
        :return:
        """
        proxies = {
            scheme: scheme + '://' + ip + ':' + str(port) + '/'
        }
        response_time = 0
        status = '0'
        try:
            # timeout keeps a dead proxy from blocking the whole run
            response = requests.get(scheme + '://www.baidu.com/',
                                    proxies=proxies, timeout=10)
            if response.ok:
                response_time = round(response.elapsed.total_seconds() * 1000)
                status = '1'
        except requests.RequestException:
            pass
        return {"response_time": response_time, "status": status}

    def verify_all(self):
        """
        Main verification method: fetch every proxy from the database
        and re-verify it.
        :return:
        """
        results = self.MySQL.find_all()
        for result in results:
            res = self.verify_proxy(result[1], result[2], result[3])
            proxy = {
                "id": result[0],
                "scheme": result[1],
                "ip": result[2],
                "port": result[3],
                "status": res["status"],
                "response_time": res["response_time"],
            }
            self.MySQL.update_proxy(proxy)
        print('Proxy verification finished')


if __name__ == '__main__':
    VerifyProxy().verify_all()
```
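For illustration, the core of `verify_proxy` can be restated as two pure helpers: the mapping format Requests expects for its `proxies` argument, and the millisecond rounding applied to `response.elapsed`. The names `build_proxies` and `elapsed_ms` are mine, not from the original post:

```python
def build_proxies(scheme, ip, port):
    # Requests routes a request through a proxy when the mapping key
    # matches the URL's scheme, e.g. {"http": "http://1.2.3.4:8080/"}.
    return {scheme: scheme + '://' + ip + ':' + str(port) + '/'}


def elapsed_ms(seconds):
    # response.elapsed is a timedelta; total_seconds() * 1000, rounded,
    # gives the whole-millisecond value stored in response_time.
    return round(seconds * 1000)


print(build_proxies('http', '1.2.3.4', '8080'))  # → {'http': 'http://1.2.3.4:8080/'}
print(elapsed_ms(0.1234))                        # → 123
```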
Source: https://www.cnblogs.com/babycomeon/p/12143325.html