先来定义个采集规则接口, 根据规则获取单个或一批内容.
/// <summary>
/// Scraping contract: extracts content from a page according to a <see cref="SpliderRule"/>.
/// </summary>
public interface IDataSplider
{
    /// <summary>
    /// Gets the content described by <paramref name="rule"/>.
    /// </summary>
    /// <param name="rule">The scraping rule to apply.</param>
    /// <returns>The list of extracted content items.</returns>
    List<SpliderContent> GetByRule(SpliderRule rule);

    /// <summary>
    /// Gets the field information for a single HTML node.
    /// </summary>
    /// <param name="node">The HTML node the fields are read from.</param>
    /// <param name="rule">The scraping rule that describes the fields.</param>
    /// <returns>The list of extracted fields.</returns>
    List<Field> GetFields(HtmlNode node, SpliderRule rule);
}
必不可少的规则类, 用来配置 XPath 根路径.
/// <summary>
/// Scraping rule; can describe either a list page or a detail page.
/// </summary>
public class SpliderRule
{
    /// <summary>
    /// Identifier of this rule.
    /// </summary>
    public string Id { get; set; }

    /// <summary>
    /// Address of the page to load.
    /// </summary>
    public string Url { get; set; }

    /// <summary>
    /// XPath of the page block that contains the content.
    /// </summary>
    public string ContentXPath { get; set; }

    /// <summary>
    /// XPath of each item inside the content block; set it to scrape a list page.
    /// </summary>
    public string EachXPath { get; set; }

    /// <summary>
    /// Field definitions to extract. Defaults to an empty list so that a rule
    /// created without fields can be enumerated without a null check.
    /// </summary>
    public List<RuleField> RuleFields { get; set; } = new List<RuleField>();
}
然后就是属性字段的自定义设置. 这里根据内容特性加入了正则支持: 匹配到正则的内容会被替换为空串, 例如评论数只需要数字, 就可以用 [^0-9]+ 去掉所有非数字字符. 另外还有 Attribute 字段, 用来获取 node 上指定 Attribute 的值 (例如链接的 href).
/// <summary>
/// User-defined field of a scraping rule.
/// </summary>
public class RuleField
{
    /// <summary>
    /// Identifier of the field.
    /// </summary>
    public string Id { get; set; }

    /// <summary>
    /// Name shown to the user.
    /// </summary>
    public string DisplayName { get; set; }

    /// <summary>
    /// Alias used when the field is stored.
    /// </summary>
    public string FieldName { get; set; }

    /// <summary>
    /// XPath that locates the node of this field.
    /// </summary>
    public string XPath { get; set; }

    /// <summary>
    /// Name of the node attribute to read, if any.
    /// </summary>
    public string Attribute { get; set; }

    /// <summary>
    /// Regular expression used to filter the extracted HTML.
    /// </summary>
    public string InnerHtmlRegex { get; set; }

    /// <summary>
    /// Regular expression used to filter the extracted text.
    /// </summary>
    public string InnerTextRegex { get; set; }

    /// <summary>
    /// Whether the inner text should be preferred as the value.
    /// </summary>
    public bool IsFirstInnerText { get; set; }
}
下面是文章爬虫 ArticleSplider: 它实现了接口 IDataSplider, 按采集规则逐步解析页面.
/// <summary>
/// Article scraper that supports both list pages and detail pages.
/// </summary>
public class ArticleSplider : IDataSplider
{
    /// <summary>
    /// Loads the page at <c>rule.Url</c> and extracts its content according to <paramref name="rule"/>.
    /// </summary>
    /// <param name="rule">The scraping rule to apply.</param>
    /// <returns>
    /// One item per node matched by <c>EachXPath</c> for a list page, or a single item
    /// for a detail page. Empty when the content block or the list items are not found.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="rule"/> is null.</exception>
    public List<SpliderContent> GetByRule(SpliderRule rule)
    {
        if (rule == null)
        {
            throw new System.ArgumentNullException(nameof(rule));
        }

        var list = new List<SpliderContent>();

        // 1. Load the HTML (original author's note: works for a web URL or a local path).
        var web = new HtmlWeb();
        var htmlDoc = web.Load(rule.Url);

        // SelectSingleNode returns null when the XPath matches nothing.
        var contentNode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);
        if (contentNode == null)
        {
            return list;
        }

        // Detail page: the content block itself is the only item.
        if (string.IsNullOrWhiteSpace(rule.EachXPath))
        {
            list.Add(new SpliderContent()
            {
                Fields = GetFields(contentNode, rule),
                SpliderRuleId = rule.Id
            });
            return list;
        }

        // List page: SelectNodes returns null (not an empty collection) when nothing matches.
        var itemNodes = contentNode.SelectNodes(rule.EachXPath);
        if (itemNodes == null)
        {
            return list;
        }

        foreach (var itemNode in itemNodes)
        {
            list.Add(new SpliderContent()
            {
                Fields = GetFields(itemNode, rule),
                SpliderRuleId = rule.Id
            });
        }
        return list;
    }

    /// <summary>
    /// Extracts one <see cref="Field"/> per entry of <c>rule.RuleFields</c> from <paramref name="item"/>.
    /// </summary>
    /// <param name="item">The node that each field XPath is evaluated against.</param>
    /// <param name="rule">The rule whose field definitions are applied.</param>
    /// <returns>
    /// The extracted fields. A field whose XPath matches nothing is still returned,
    /// with only its display name and field name set.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="item"/> or <paramref name="rule"/> is null.</exception>
    public List<Field> GetFields(HtmlNode item, SpliderRule rule)
    {
        if (item == null)
        {
            throw new System.ArgumentNullException(nameof(item));
        }
        if (rule == null)
        {
            throw new System.ArgumentNullException(nameof(rule));
        }

        var fields = new List<Field>();
        if (rule.RuleFields == null)
        {
            return fields;
        }

        foreach (var ruleField in rule.RuleFields)
        {
            var field = new Field()
            {
                DisplayName = ruleField.DisplayName,
                // Carry the storage alias over from the rule; fall back to "" when it is not set.
                FieldName = ruleField.FieldName ?? ""
            };

            var fieldNode = item.SelectSingleNode(ruleField.XPath);
            if (fieldNode != null)
            {
                field.InnerHtml = fieldNode.InnerHtml;
                field.InnerText = fieldNode.InnerText;
                field.AfterRegexHtml = RemoveMatches(fieldNode.InnerHtml, ruleField.InnerHtmlRegex);
                field.AfterRegexText = RemoveMatches(fieldNode.InnerText, ruleField.InnerTextRegex);

                if (!string.IsNullOrWhiteSpace(ruleField.Attribute))
                {
                    // A missing attribute leaves Value null instead of throwing.
                    field.Value = fieldNode.Attributes[ruleField.Attribute]?.Value;
                }
                else
                {
                    field.Value = ruleField.IsFirstInnerText ? field.AfterRegexText : field.AfterRegexHtml;
                }
            }
            fields.Add(field);
        }
        return fields;
    }

    /// <summary>
    /// Removes every match of <paramref name="pattern"/> from <paramref name="input"/>;
    /// returns <paramref name="input"/> unchanged when the pattern is blank.
    /// </summary>
    private static string RemoveMatches(string input, string pattern)
    {
        return string.IsNullOrWhiteSpace(pattern) ? input : Regex.Replace(input, pattern, "");
    }
}
还是以博客园为例, 配置内容和属性的自定义规则
/// <summary>
/// Demo: scrapes the post list of the cnblogs.com home page with <see cref="ArticleSplider"/>
/// and writes one line per post to the console.
/// </summary>
public void RunArticleRule()
{
    var postitembodyXPath = "div[@class='post_item_body']//";
    var postitembodyFootXPath = postitembodyXPath + "div[@class='post_item_foot']//";
    var rule = new SpliderRule()
    {
        ContentXPath = "//div[@id='post_list']",
        EachXPath = "div[@class='post_item']",
        Url = "https://www.cnblogs.com",
        RuleFields = new List<RuleField>()
        {
            new RuleField() { DisplayName = "推荐", XPath = "*//span[@class='diggnum']", IsFirstInnerText = true },
            new RuleField() { DisplayName = "标题", XPath = postitembodyXPath + "a[@class='titlelnk']", IsFirstInnerText = true },
            new RuleField() { DisplayName = "URL", XPath = postitembodyXPath + "a[@class='titlelnk']", Attribute = "href", IsFirstInnerText = true },
            new RuleField() { DisplayName = "简要", XPath = postitembodyXPath + "p[@class='post_item_summary']", IsFirstInnerText = true },
            new RuleField() { DisplayName = "作者", XPath = postitembodyFootXPath + "a[@class='lightblue']", IsFirstInnerText = true },
            new RuleField() { DisplayName = "作者 URL", XPath = postitembodyFootXPath + "a[@class='lightblue']", Attribute = "href", IsFirstInnerText = true },
            // Anchored under the post footer like the view count below; a bare "span[...]" would
            // only match a direct child of the post item. NOTE(review): assumes the comment span
            // sits in the footer next to article_view - confirm against the page markup.
            new RuleField() { DisplayName = "讨论数", XPath = postitembodyFootXPath + "span[@class='article_comment']", IsFirstInnerText = true, InnerTextRegex = @"[^0-9]+" },
            new RuleField() { DisplayName = "阅读数", XPath = postitembodyFootXPath + "span[@class='article_view']", IsFirstInnerText = true, InnerTextRegex = @"[^0-9]+" },
        }
    };

    var splider = new ArticleSplider();
    var list = splider.GetByRule(rule);
    foreach (var item in list)
    {
        var parts = new List<string>();
        foreach (var field in item.Fields)
        {
            // Leave the summary and the URL fields out to keep each line short.
            if (field.DisplayName != "简要" && !field.DisplayName.Contains("URL"))
            {
                parts.Add($"{field.DisplayName}:{field.Value}");
            }
        }
        // Separate the entries so that adjacent fields do not run together.
        Console.WriteLine(string.Join(" ", parts));
    }
}
来源: https://www.cnblogs.com/fancunwei/p/9588629.html