我们的战场就选在博客园吧,假设现在要采集博客园首页的所有文章信息,包括文章标题,链接,作者博客地址,文章简介,文章发布时间,阅读数,评论数,推荐数。
先看博客园文章的Html格式
<div> <div> <div> <span>4</span> </div> <div></div> <div></div> </div> <div> <h3><a href="https://www.cnblogs.com/swq6413/p/3439076.html" target="_blank">分享完整的项目工程目录结构</a></h3> <p> <a href="https://www.cnblogs.com/swq6413/" target="_blank"><img src="https://pic.cnitblog.com/face/142964/20131116170946.png" alt=""/></a> 在项目开发过程中,如何有序的保存项目中的各类数据文件,建立一个分类清晰、方便管理的目录结构是非常重要的。 综合以前的项目和一些朋友的项目结构,我整理了一份我觉得还不错的项目目录结构。 在这里分享给大家,欢迎各位提出你宝贵的意见和建议。如果喜欢请“推荐”则个,感激万分!! 整个目录设置到4级子目录,实... </p> <div> <a href="https://www.cnblogs.com/swq6413/">七少爷</a> 发布于 2013-11-23 15:48 <span><a href="http://www.cnblogs.com/swq6413/p/3439076.html#commentform" title="2013-11-23 16:40"> 评论(4)</a></span><span><a href="https://www.cnblogs.com/swq6413/p/3439076.html">阅读(206)</a></span></div> </div> <div></div> </div>
通过构造一个Http请求来取到数据,并对数据进行相应处理得到关键信息,在过滤Html标签取文章时正则的强大威力就体现出来了,
正则的知识点也都基本用上了比如 "\s \w+ . * ? "还有捕获分组,零宽断言等等。喜欢的朋友可以试一试,然后自己看如何通过正则取相应数据的,代码中的正则都是很基本简单的,其意思与用法都在上文中详细写了。
/// <summary>
/// Console entry point: downloads the cnblogs home page and parses the
/// article entries out of it.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        string content = HttpUtility.HttpGetHtml();
        HttpUtility.GetArticles(content);
    }
}

/// <summary>
/// Helpers for fetching the cnblogs home page and extracting article
/// information from its HTML with regular expressions.
/// </summary>
internal class HttpUtility
{
    // The patterns below never change, so each Regex is built once and reused
    // for every article instead of being re-created inside the parsing loop.

    // One article entry: from the post_item div up to (not including) its closing "clear" div.
    private static readonly Regex ArticleItemRegex = new Regex(
        "<div class=\"post_item\">(?<content>.*?)(?=<div class=\"clear\">" + @"</div>\s*</div>)",
        RegexOptions.Singleline);

    // Recommendation count inside the digg box.
    private static readonly Regex DiggRegex = new Regex(
        "<div class=\"digg\">.*<span.*>(?<digNum>.*)" + @"</span>" + ".*<div class=\"post_item_body\">",
        RegexOptions.Singleline);

    // Content of the <h3> heading, which wraps the title link.
    private static readonly Regex HeadingRegex = new Regex(
        "<h3>(?<a>.*)</h3>",
        RegexOptions.Singleline);

    // href and inner text of the title link (applied to the heading content only).
    private static readonly Regex TitleLinkRegex = new Regex(
        "<a\\s.*href=\"(?<href>.*?)\".*>(?<summary>.*)</a>",
        RegexOptions.Singleline);

    // The <img> tag that is wrapped by a link (the author's avatar).
    private static readonly Regex AuthorImgRegex = new Regex(
        "<a.*>(?<img><img[^>].*>)</a>",
        RegexOptions.Singleline);

    // A link whose href attribute is immediately followed by a target attribute.
    private static readonly Regex AuthorUrlRegex = new Regex(
        "<a\\s*?href=\"(?<href>.*)\"\\s*?target=\"(?<target>.*?)\">.*</a>",
        RegexOptions.Singleline);

    // Everything inside the summary paragraph.
    private static readonly Regex SummaryBlockRegex = new Regex(
        "<p class=\"post_item_summary\">(?<summary>.*)</p>",
        RegexOptions.Singleline);

    // Text that follows the first closing </a> inside the summary paragraph.
    private static readonly Regex IntroRegex = new Regex(
        "(?<indroduct>(?<=</a>).*)",
        RegexOptions.Singleline);

    // Author name and the text between the author link and the comment span.
    private static readonly Regex FootRegex = new Regex(
        "<div class=\"post_item_foot\">\\s*<a.*?>(?<publishName>.*)</a>(?<publishTime>.*)<span class=\"article_comment\">",
        RegexOptions.Singleline);

    // Text of the comment link.
    private static readonly Regex CommentRegex = new Regex(
        "<span class=\"article_comment\"><a.*>(?<comment>.*)</a></span><span class=\"article_view\">",
        RegexOptions.Singleline);

    // Text of the read-count link.
    private static readonly Regex ReadRegex = new Regex(
        "<span\\s*class=\"article_view\"><a.*>(?<readNum>.*)</a>",
        RegexOptions.Singleline);

    /// <summary>
    /// Downloads the first (default) page of http://www.cnblogs.com/ and
    /// returns the response body decoded as UTF-8.
    /// </summary>
    /// <returns>The HTML of the page.</returns>
    public static string HttpGetHtml()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/");
        request.Accept = "text/plain, */*; q=0.01";
        request.Method = "GET";
        request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        // A GET request carries no body, so the declared length is zero.
        request.ContentLength = 0;
        request.Host = "www.cnblogs.com";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.1.3.5000 Chrome/26.0.1410.43 Safari/537.1";

        // Dispose the response, its stream and the reader so the underlying
        // connection is released even if reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts one <see cref="Article"/> per <c>post_item</c> block found in
    /// the given HTML.
    /// </summary>
    /// <param name="htmlString">HTML of the article list page.</param>
    /// <returns>
    /// The parsed articles in document order; an empty list when no block
    /// matches. A field whose pattern does not match is set to an empty string.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
    public static List<Article> GetArticles(string htmlString)
    {
        if (htmlString == null)
        {
            throw new ArgumentNullException("htmlString");
        }

        List<Article> articleList = new List<Article>();

        // Matches() yields nothing when there is no match, so a separate
        // IsMatch() pass over the whole page is unnecessary.
        foreach (Match item in ArticleItemRegex.Matches(htmlString))
        {
            string itemHtml = item.Value;
            Article article = new Article();

            // Recommendation count.
            article.DiggNum = DiggRegex.Match(itemHtml).Groups["digNum"].Value;

            // Title and article URL: narrow down to the <h3> first, then read the link.
            // NOTE: the title is stored exactly as captured; no unescaping is done here.
            string heading = HeadingRegex.Match(itemHtml).Groups["a"].Value;
            Match titleLink = TitleLinkRegex.Match(heading);
            article.AritcleUrl = titleLink.Groups["href"].Value;
            article.AritcleTitle = titleLink.Groups["summary"].Value;

            // Author avatar (the complete <img> tag).
            article.AuthorImg = AuthorImgRegex.Match(itemHtml).Groups["img"].Value;

            // Author blog URL.
            article.AuthorUrl = AuthorUrlRegex.Match(itemHtml).Groups["href"].Value;

            // Introduction: take the summary paragraph, then the text after its link.
            string summary = SummaryBlockRegex.Match(itemHtml).Groups["summary"].Value;
            article.AritcleInto = IntroRegex.Match(summary).Groups["indroduct"].Value;

            // Author name and publish time come from the same footer match.
            Match foot = FootRegex.Match(itemHtml);
            article.Author = foot.Groups["publishName"].Value;
            article.PublishTime = foot.Groups["publishTime"].Value.Trim();

            // Comment count text.
            article.CommentNum = CommentRegex.Match(itemHtml).Groups["comment"].Value;

            // Read count text.
            article.ReadNum = ReadRegex.Match(itemHtml).Groups["readNum"].Value;

            articleList.Add(article);
        }

        return articleList;
    }

    /// <summary>
    /// Removes line feeds, tabs and carriage returns from the input and
    /// replaces every double quote with a single quote.
    /// </summary>
    /// <param name="htmlString">Text to clean.</param>
    /// <returns>The cleaned text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
    public static string ClearSpecialTag(string htmlString)
    {
        if (htmlString == null)
        {
            throw new ArgumentNullException("htmlString");
        }

        // Plain literal replacements; no regular expression is needed for these.
        return htmlString
            .Replace("\n", "")
            .Replace("\t", "")
            .Replace("\r", "")
            .Replace("\"", "'");
    }
}

/// <summary>
/// Information scraped for a single article entry. Every value is kept as the
/// raw text captured from the page.
/// </summary>
public class Article
{
    /// <summary>
    /// Article title.
    /// </summary>
    public string AritcleTitle { get; set; }

    /// <summary>
    /// Article URL.
    /// </summary>
    public string AritcleUrl { get; set; }

    /// <summary>
    /// Article introduction.
    /// </summary>
    public string AritcleInto { get; set; }

    /// <summary>
    /// Author name.
    /// </summary>
    public string Author { get; set; }

    /// <summary>
    /// Author blog URL.
    /// </summary>
    public string AuthorUrl { get; set; }

    /// <summary>
    /// Author avatar (the captured img tag).
    /// </summary>
    public string AuthorImg { get; set; }

    /// <summary>
    /// Publish time text.
    /// </summary>
    public string PublishTime { get; set; }

    /// <summary>
    /// Recommendation count.
    /// </summary>
    public string DiggNum { get; set; }

    /// <summary>
    /// Comment count.
    /// </summary>
    public string CommentNum { get; set; }

    /// <summary>
    /// Read count.
    /// </summary>
    public string ReadNum { get; set; }
}
正则部分可能写得不是很完美,但至少也匹配出来了,另外因为自己也是刚接触正则,也只能写出这种比较简单的正则。还望大家海涵~~
五 总结