前言:
首先表示抱歉,春节后一直较忙,未及时更新该系列文章。
近期,由于监控的站源越来越多,就偶有站源做了反爬机制,造成我们的 SupportYun 系统小爬虫服务时常被封 IP,不能进行数据采集。
这时候,前面有园友提到的 IP 代理就该上场表演了。
IP 代理池设计:
博主查阅与调研了多方资料,最终决定先通过爬取网络上各大 IP 代理网站免费代理的方式,来建立自己的 IP 代理池。
最终爬取了五家较为优质的 IP 代理站点:
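1. 西刺代理:http://www.xicidaili.com/
2. 快代理:http://www.kuaidaili.com/
3. proxy360:http://www.proxy360.cn/
4. 逼格代理:http://www.bigdaili.com/
5. 66 免费代理:http://www.66ip.cn/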
IP 代理池方案设计如下:
简单点说就是把在采集的站源里面已知具有反爬机制的站源打上标签,修改所有的爬虫服务,遇到有此标签的站源先从 IP 代理池随机获取可用的代理 IP 再进行数据爬取。
安装 Redis:
首先,我们需要一台服务器来部署我们的 Redis 服务(先不考虑集群什么的)。
博主一向不喜欢弹个小黑框,不停敲命令行进行操作的各种方式。个人认为,GUI 是推动计算机快速发展的重要因素之一(非喜勿喷)。
翻阅了资料,找到了简易的 redis 安装客户端(windows 版本,安装简单到爆),地址如下:
在博客园找到一篇介绍 redis 配置文件的博文,贴出来供大家参考:
话说博主就简单的修改了一下内存限制,设置了允许外网连接,设置了一个密码,也没多改其他东西。
注意,配置文件在安装完成后的目录下,名称是:redis.windows-service.conf
熟悉一点的都知道,redis 的 C# 驱动 ServiceStack.Redis 可以直接通过 NuGet 安装。比较坑的是 4.0 版本后商业化了,免费版限制每小时 6000 次请求;要么下载 3.9 版本,要么考虑其他的驱动,例如:StackExchange.Redis。
博主使用的是 ServiceStack V3.9 版本,附上下载地址:
下面附上博主基于 ServiceStack 写的 RedisManageService,由于业务简单,只使用到了几个 API,大家凑合着看。
/// <summary>
/// Static helper around the ServiceStack Redis client. Only the handful of
/// set operations the crawler needs are exposed. Every call creates its own
/// client and disposes it when the call finishes.
/// </summary>
public class RedisManageService
{
    /// <summary>Port used for every Redis connection.</summary>
    private const int RedisPort = 6379;

    private static readonly string redisAddress = ConfigurationManager.AppSettings["RedisAddress"];

    // Prefer the "RedisPassword" app setting so the secret can live outside source control.
    // The literal is kept only as a fallback for deployments that have not added the setting.
    private static readonly string redisPassword =
        ConfigurationManager.AppSettings["RedisPassword"] ?? "myRedisPassword";

    /// <summary>
    /// Creates a client for the configured address, port and password.
    /// The caller is responsible for disposing it.
    /// </summary>
    private static RedisClient CreateClient()
    {
        return new RedisClient(redisAddress, RedisPort, redisPassword);
    }

    /// <summary>
    /// Returns one random member of the given set.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <returns>The member picked by Redis.</returns>
    /// <exception cref="InvalidOperationException">The client returned null (no member available).</exception>
    public static string GetRandomItemFromSet(RedisSetNameEnum setName)
    {
        string key = setName.ToString();
        using (RedisClient client = CreateClient())
        {
            string item = client.GetRandomItemFromSet(key);
            if (item == null)
            {
                // InvalidOperationException derives from Exception, so existing
                // catch (Exception) callers keep working.
                throw new InvalidOperationException("redis set集合" + key + "已无数据!");
            }

            return item;
        }
    }

    /// <summary>
    /// Removes one value from the given set.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <param name="value">Member to remove.</param>
    public static void RemoveItemFromSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            client.RemoveItemFromSet(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Adds one value to the given set.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <param name="value">Member to add.</param>
    public static void AddItemToSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            client.AddItemToSet(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Adds a list of values to the given set. A null or empty list is ignored.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <param name="values">Members to add.</param>
    public static void AddItemListToSet(RedisSetNameEnum setName, List<string> values)
    {
        if (values == null || values.Count == 0)
        {
            // Nothing to send; skip opening a connection.
            return;
        }

        using (RedisClient client = CreateClient())
        {
            client.AddRangeToSet(setName.ToString(), values);
        }
    }

    /// <summary>
    /// Tells whether the given value is a member of the given set.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <param name="value">Member to look for.</param>
    /// <returns>True when the value is in the set.</returns>
    public static bool JudgeItemInSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            // Ask the server for membership instead of downloading the whole
            // set and scanning it on the client.
            return client.SetContainsItem(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Returns the number of members in the given set.
    /// </summary>
    /// <param name="setName">Enum value whose name is used as the Redis key.</param>
    /// <returns>The member count reported by the client.</returns>
    public static long GetSetCount(RedisSetNameEnum setName)
    {
        using (RedisClient client = CreateClient())
        {
            return client.GetSetCount(setName.ToString());
        }
    }
}
免费代理 IP 抓取服务实现:
我们首先设计一个最简单的 IpProxy 对象:
/// <summary>
/// One proxy endpoint: a host address plus a port.
/// </summary>
public class IpProxy
{
    /// <summary>Host address of the proxy.</summary>
    public string Address { get; set; }

    /// <summary>Port of the proxy.</summary>
    public int Port { get; set; }
}
然后实现一个基于 Redis 的 Ip 代理池操作服务:
/// <summary>
/// Proxy pool operations backed by the <c>RedisSetNameEnum.ProxyPool</c> set.
/// Entries are stored as "address:port" strings.
/// </summary>
public class PoolManageService
{
    /// <summary>
    /// Picks random entries from the pool until one passes
    /// <c>HttpHelper.IsAvailable</c>. Entries that fail the check, or that
    /// cannot be parsed as "address:port", are removed from the pool.
    /// </summary>
    /// <returns>
    /// The verified "address:port" entry, or an empty string when none could be
    /// obtained. Failures are logged and never thrown to the caller.
    /// </returns>
    public static string GetProxy()
    {
        try
        {
            // A loop instead of recursion, so a long run of dead proxies does
            // not grow the call stack.
            while (true)
            {
                string candidate = RedisManageService.GetRandomItemFromSet(RedisSetNameEnum.ProxyPool);
                if (string.IsNullOrEmpty(candidate))
                {
                    return string.Empty;
                }

                string host;
                int port;
                if (TryParseEndpoint(candidate, out host, out port) && HttpHelper.IsAvailable(host, port))
                {
                    return candidate;
                }

                // Remove directly rather than through DeleteProxy, which
                // swallows errors. A failed removal must end this loop,
                // otherwise the same entry could be drawn again forever.
                RedisManageService.RemoveItemFromSet(RedisSetNameEnum.ProxyPool, candidate);
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("从代理池获取代理数据出错", e));

            // Never hand out an entry whose verification did not complete.
            return string.Empty;
        }
    }

    /// <summary>
    /// Removes one entry from the pool. Failures are logged, not thrown.
    /// </summary>
    /// <param name="value">The "address:port" entry to remove.</param>
    public static void DeleteProxy(string value)
    {
        try
        {
            RedisManageService.RemoveItemFromSet(RedisSetNameEnum.ProxyPool, value);
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("从代理池删除代理数据出错", e));
        }
    }

    /// <summary>
    /// Adds a proxy to the pool if it passes <c>HttpHelper.IsAvailable</c>.
    /// Failures are logged, not thrown.
    /// </summary>
    /// <param name="proxy">Proxy to verify and store.</param>
    public static void Add(IpProxy proxy)
    {
        try
        {
            if (proxy == null)
            {
                // Thrown inside the try so it is logged with a clear cause,
                // like every other failure of this method.
                throw new ArgumentNullException("proxy");
            }

            if (HttpHelper.IsAvailable(proxy.Address, proxy.Port))
            {
                RedisManageService.AddItemToSet(RedisSetNameEnum.ProxyPool, proxy.Address + ":" + proxy.Port.ToString());
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("添加一条代理数据到代理池出错", e));
        }
    }

    /// <summary>
    /// Splits an "address:port" entry once. Returns false when there is no
    /// port part or the port is not an integer.
    /// </summary>
    private static bool TryParseEndpoint(string endpoint, out string host, out int port)
    {
        host = null;
        port = 0;

        string[] parts = endpoint.Split(new[] { ':' });
        if (parts.Length < 2 || !int.TryParse(parts[1], out port))
        {
            return false;
        }

        host = parts[0];
        return true;
    }
}
提供简易的三个方法:添加代理 IP、删除代理 IP、随机获取一条代理 IP
我们还需要一个爬虫服务,来爬取我们需要的免费代理 IP 数据:
/// <summary>
/// Downloads free-proxy listing pages, extracts address/port pairs from the
/// HTML and passes each one to <c>PoolManageService.Add</c> on a background task.
/// TODO: the listing sites change often; keep an eye on the error log.
/// </summary>
public class IpPoolSpider
{
    /// <summary>
    /// Queues the download routines on the thread pool.
    /// </summary>
    public void Initial()
    {
        // NOTE(review): Downkuaidaili is not queued here - confirm whether that is intentional.
        ThreadPool.QueueUserWorkItem(Downloadproxy360);
        ThreadPool.QueueUserWorkItem(DownloadproxyBiGe);
        ThreadPool.QueueUserWorkItem(Downloadproxy66);
        ThreadPool.QueueUserWorkItem(Downloadxicidaili);
    }

    /// <summary>
    /// Crawls pages 1-4 of four xicidaili listings. Each page is fetched
    /// through a proxy taken from the pool; the crawl stops as soon as the pool
    /// has no usable proxy.
    /// </summary>
    /// <param name="DATA">Thread-pool state object; not used.</param>
    public void Downloadxicidaili(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.xicidaili.com/nt/",
                "http://www.xicidaili.com/nn/",
                "http://www.xicidaili.com/wn/",
                "http://www.xicidaili.com/wt/"
            };

            foreach (var utlitem in list)
            {
                for (int i = 1; i < 5; i++)
                {
                    string url = utlitem + i.ToString();

                    var ipProxy = PoolManageService.GetProxy();
                    if (string.IsNullOrEmpty(ipProxy))
                    {
                        LogUtils.ErrorLog(new Exception("Ip代理池暂无可用代理IP"));
                        return;
                    }

                    string html = HttpHelper.DownloadHtml(url, BuildWebProxy(ipProxy));
                    HtmlNodeCollection rows = SelectRows(url, html, "//tr[@class='odd']");
                    if (rows == null)
                    {
                        continue;
                    }

                    foreach (var item in rows)
                    {
                        QueueProxy(InnerHtmlOf(item, "td[2]"), InnerHtmlOf(item, "td[3]"));
                    }
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载西刺代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls pages 1-3 of the kuaidaili proxy list.
    /// </summary>
    /// <param name="DATA">Thread-pool state object; not used.</param>
    public void Downkuaidaili(object DATA)
    {
        try
        {
            string url = "http://www.kuaidaili.com/proxylist/";

            for (int i = 1; i < 4; i++)
            {
                string pageUrl = url + i.ToString() + "/";
                string html = HttpHelper.DownloadHtml(pageUrl, null);
                HtmlNodeCollection rows = SelectRows(pageUrl, html, "//tbody/tr");
                if (rows == null)
                {
                    continue;
                }

                foreach (var item in rows)
                {
                    // The address is read from the row's first child node.
                    HtmlNode first = item.FirstChild;
                    QueueProxy(first == null ? null : first.InnerHtml, InnerHtmlOf(item, "td[2]"));
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载快代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls the single proxy360 listing page.
    /// </summary>
    /// <param name="DATA">Thread-pool state object; not used.</param>
    public void Downloadproxy360(object DATA)
    {
        try
        {
            string url = "http://www.proxy360.cn/default.aspx";
            string html = HttpHelper.DownloadHtml(url, null);
            HtmlNodeCollection rows = SelectRows(url, html, "//div[@class='proxylistitem']");
            if (rows == null)
            {
                return;
            }

            foreach (var item in rows)
            {
                // The spans with address and port sit under the second child node.
                if (item.ChildNodes.Count < 2)
                {
                    continue;
                }

                var childnode = item.ChildNodes[1];
                QueueProxy(InnerHtmlOf(childnode, "span[1]"), InnerHtmlOf(childnode, "span[2]"));
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载proxy360IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls pages 1-4 of four bigdaili listings.
    /// </summary>
    /// <param name="DATA">Thread-pool state object; not used.</param>
    public void DownloadproxyBiGe(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.bigdaili.com/dailiip/1/{0}.html",
                "http://www.bigdaili.com/dailiip/2/{0}.html",
                "http://www.bigdaili.com/dailiip/3/{0}.html",
                "http://www.bigdaili.com/dailiip/4/{0}.html"
            };

            foreach (var utlitem in list)
            {
                for (int i = 1; i < 5; i++)
                {
                    string url = String.Format(utlitem, i);
                    string html = HttpHelper.DownloadHtml(url, null);
                    HtmlNodeCollection rows = SelectRows(url, html, "//tbody/tr");
                    if (rows == null)
                    {
                        continue;
                    }

                    foreach (var item in rows)
                    {
                        QueueProxy(InnerHtmlOf(item, "td[1]"), InnerHtmlOf(item, "td[2]"));
                    }
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载逼格代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls three 66ip.cn listing pages.
    /// </summary>
    /// <param name="DATA">Thread-pool state object; not used.</param>
    public void Downloadproxy66(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.66ip.cn/areaindex_35/index.html",
                "http://www.66ip.cn/areaindex_35/2.html",
                "http://www.66ip.cn/areaindex_35/3.html"
            };

            foreach (var utlitem in list)
            {
                string url = utlitem;
                string html = HttpHelper.DownloadHtml(url, null);
                HtmlNodeCollection rows = SelectRows(url, html, "//table[@bordercolor='#6699ff']/tr");
                if (rows == null)
                {
                    // The pages are independent URLs, so a failed page is
                    // skipped the same way the other sources skip theirs.
                    continue;
                }

                foreach (var item in rows)
                {
                    string address = InnerHtmlOf(item, "td[1]");

                    // Rows whose first cell contains "ip" are skipped, as in the
                    // original code (presumably the table header - confirm).
                    if (address != null && address.Contains("ip"))
                    {
                        continue;
                    }

                    QueueProxy(address, InnerHtmlOf(item, "td[2]"));
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载66免费代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Builds a WebProxy from a pool entry. "address:port" entries use both
    /// parts; anything else is passed to WebProxy as a bare address.
    /// </summary>
    private static WebProxy BuildWebProxy(string endpoint)
    {
        if (!endpoint.Contains(":"))
        {
            return new WebProxy(endpoint);
        }

        string[] parts = endpoint.Split(new[] { ':' });
        int port;
        if (int.TryParse(parts[1], out port))
        {
            return new WebProxy(parts[0], port);
        }

        // Unparseable port: fall back to the address alone instead of aborting the crawl.
        return new WebProxy(parts[0]);
    }

    /// <summary>
    /// Parses a downloaded page and returns the nodes matching the XPath.
    /// Returns null when the download produced no HTML (this is logged) or when
    /// nothing matched (SelectNodes reports "no match" as null, not as an empty collection).
    /// </summary>
    private static HtmlNodeCollection SelectRows(string url, string html, string xpath)
    {
        if (string.IsNullOrEmpty(html))
        {
            LogUtils.ErrorLog(new Exception("代理地址:" + url + " 访问失败"));
            return null;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc.DocumentNode.SelectNodes(xpath);
    }

    /// <summary>
    /// Returns the InnerHtml of the first node matching the relative XPath, or
    /// null when the node is missing.
    /// </summary>
    private static string InnerHtmlOf(HtmlNode parent, string xpath)
    {
        HtmlNode cell = parent.SelectSingleNode(xpath);
        return cell == null ? null : cell.InnerHtml;
    }

    /// <summary>
    /// Turns raw cell text into an IpProxy and hands it to the pool on a
    /// background task. Rows with an empty address or a non-numeric port are
    /// skipped, so one bad row cannot abort the rest of the page.
    /// </summary>
    private static void QueueProxy(string addressText, string portText)
    {
        string address = addressText == null ? null : addressText.Trim();
        int port;
        if (string.IsNullOrEmpty(address) || !int.TryParse(portText, out port))
        {
            return;
        }

        // A fresh object per row, so each task captures its own proxy.
        var proxy = new IpProxy();
        proxy.Address = address;
        proxy.Port = port;
        Task.Run(() =>
        {
            PoolManageService.Add(proxy);
        });
    }
}
这段代码也没什么营养,就不仔细解释了。
前面有说到,博主的爬虫服务都是以 windows 服务的方式部署的。以前一直用 Timer 来实现固定间隔多次循环,这次博主引用了 Quartz.NET 任务调度框架来做,代码看起来更优美一点。
Quartz.NET 可直接在 NuGet 下载安装。
先写一个代理池的总调度任务类 ProxyPoolTotalJob,实现 IJob 接口:
/// <summary>
/// Scheduler job for the proxy pool: each execution starts the proxy crawl.
/// </summary>
class ProxyPoolTotalJob : IJob
{
    /// <summary>
    /// Creates a new <c>IpPoolSpider</c> and calls its <c>Initial</c> method.
    /// </summary>
    /// <param name="context">Execution context passed in by the scheduler; not read here.</param>
    public void Execute(IJobExecutionContext context)
    {
        new IpPoolSpider().Initial();
    }
}
接下来是在 OnStart 中运行的 Run() 方法实现:
/// <summary>
/// Starts a scheduler and registers <c>ProxyPoolTotalJob</c> with a trigger
/// that fires immediately and then every 28 minutes, forever.
/// Scheduler errors are written to the console and not rethrown.
/// </summary>
private static void Run()
{
    try
    {
        StdSchedulerFactory factory = new StdSchedulerFactory();
        IScheduler scheduler = factory.GetScheduler();
        scheduler.Start();

        // The job type must be given here; a job built without one has no
        // class for the scheduler to run.
        IJobDetail job = JobBuilder.Create<ProxyPoolTotalJob>()
            .WithIdentity("job1", "group1")
            .Build();

        ITrigger trigger = TriggerBuilder.Create()
            .WithIdentity("trigger1", "group1")
            .StartNow()
            .WithSimpleSchedule(
                x => x
                    .WithIntervalInMinutes(28) // once every 28 minutes
                    .RepeatForever()
            ).Build();

        scheduler.ScheduleJob(job, trigger);
    }
    catch (SchedulerException se)
    {
        // NOTE(review): the article says this runs inside a Windows service, where
        // console output is usually not visible - consider the project's logger instead.
        Console.WriteLine(se);
    }
}
最后采集具有反爬机制的 html 页面的时候,使用代理 IP,这个相信大家都会,设置一下 webRequest 的 Proxy 参数即可。
// Send this request through the proxy at the given ip and port.
webRequest.Proxy = new WebProxy(ip, port);
以上,就实现了一个基于 redis 的免费代理 IP 池。我们被封 IP 的爬虫服务又满血复活了,继续采集新数据去。
来源: http://www.cnblogs.com/csqb-511612371/p/6552838.html