本文實例講述了C#使用正則表達式抓取網站信息的方法。分享給大家供大家參考,具體如下:
這裡以抓取京東商城商品詳情為例。
1、創建JdRobber.cs程序類
/// <summary>
/// Scrapes the title, main image and price of a product from a JD.com
/// product-detail page and prints them to the console.
/// </summary>
public class JdRobber
{
    /// <summary>
    /// Checks whether a URL is a JD.com product-detail link.
    /// </summary>
    /// <param name="url">The URL to check; may be null or empty.</param>
    /// <returns>
    /// <c>true</c> when <paramref name="url"/> has the form
    /// <c>http://item.jd.com/{digits}.html</c>; otherwise <c>false</c>.
    /// </returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        // Dots are escaped so they match only a literal '.', not any character.
        return Regex.IsMatch(url, @"^http://item\.jd\.com/\d+\.html$");
    }

    /// <summary>
    /// Downloads a JD.com product page and writes its title, image and price
    /// to the console. Does nothing when the URL is not a JD.com product link
    /// or the page could not be downloaded.
    /// </summary>
    /// <param name="url">The product-detail page URL.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product ID: the numeric part of the URL.
        string sourceWebID = WebHandler.GetRegexText(
            url, @"http://item\.jd\.com/(?<Object>\d+)\.html");

        // Title: the first <h1> that follows the <div id="name"> element.
        string title = WebHandler.GetRegexText(
            htmlStr, @"<div.*id=""name"".*>[\s\S]*<h1>(?<Object>.*?)</h1>");

        // Image: search only inside the <div id="spec-n1"> fragment so that
        // unrelated <img> tags elsewhere on the page are not picked up.
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(
                    subPicHtml, @"<img.*src=""(?<Object>.*?)"".*/>");
            }
        }

        // Price: not part of the page HTML; it is fetched from a separate
        // JSON endpoint keyed by the product ID.
        decimal price = 0;
        if (sourceWebID != "")
        {
            string priceUrl = "http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string priceText = WebHandler.GetRegexText(
                priceJson, @"""p"":""(?<Object>\d+(\.\d{1,2})?)""");
            price = WebHandler.GetValidPrice(priceText);
        }

        Console.WriteLine("商品名稱:{0}", title);
        Console.WriteLine("圖片:{0}", picName);
        Console.WriteLine("價格:{0}", price);
    }
}
2、創建WebHandler.cs公共方法類
/// <summary>
/// Shared helper methods for downloading pages and extracting text from them.
/// </summary>
public class WebHandler
{
    /// <summary>
    /// Downloads the body of a URL as text.
    /// </summary>
    /// <param name="url">The address to download.</param>
    /// <param name="encoding">
    /// <c>"UTF8"</c> to decode the response as UTF-8; any other value
    /// (including <c>"Default"</c>) uses <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>
    /// The response body, or an empty string when <paramref name="url"/> is
    /// null/empty or the download fails for any reason.
    /// </returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using-blocks release the response, stream and reader even when
            // reading throws part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: callers treat "" as "nothing fetched".
            // The failure is traced instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning(
                "GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> of the
    /// first match of a pattern. Matching is case-insensitive.
    /// </summary>
    /// <param name="input">The text to search.</param>
    /// <param name="pattern">
    /// A regular expression containing a named group <c>(?&lt;Object&gt;...)</c>.
    /// </param>
    /// <returns>
    /// The captured text, or an empty string when either argument is
    /// null/empty or the pattern does not match.
    /// </returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts price text to a decimal.
    /// </summary>
    /// <param name="strPrice">
    /// Price text made of digits with an optional <c>.</c> followed by one or
    /// two digits, e.g. <c>"12"</c> or <c>"12.50"</c>.
    /// </param>
    /// <returns>
    /// The parsed price, or 0 when the text is null/empty, is not in the
    /// expected format, or cannot be represented as a decimal.
    /// </returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice))
        {
            return 0;
        }

        if (!Regex.IsMatch(strPrice, @"^\d+(\.\d{1,2})?$"))
        {
            return 0;
        }

        // Invariant culture: the text is machine-generated and always uses
        // '.' as the decimal separator, whatever the current UI culture is.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.AllowDecimalPoint,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}
PS:這里再為大家提供2款非常方便的正則表達式工具供大家參考使用:
JavaScript正則表達式在線測試工具:
http://tools.VeVB.COm/regex/javascript
正則表達式在線生成工具:
http://tools.VeVB.COm/regex/create_reg
更多關於C#相關內容感興趣的讀者可查看本站專題:《C#正則表達式用法總結》、《C#編碼操作技巧總結》、《C#中XML文件操作技巧匯總》、《C#常見控件用法教程》、《WinForm控件用法總結》、《C#數據結構與算法教程》、《C#面向對象程序設計入門教程》及《C#程序設計之線程使用技巧總結》
希望本文所述對大家C#程序設計有所幫助。
新聞熱點
疑難解答