国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 編程 > C# > 正文

C#網絡爬蟲代碼分享 C#簡單的爬取工具

2019-10-29 21:23:55
字體:
供稿:網友

公司編輯妹子需要爬取網頁內容,叫我幫忙做了一個簡單的爬取工具

C#網絡爬蟲,C#爬取工具

這是爬取網頁內容的方法,這對大家來說應該都不難,但是在這裡有一些小改動,代碼獻上,供大家參考

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The first attempt decodes the body as UTF-8. If that attempt fails with a
/// <see cref="WebException"/> or an <see cref="IOException"/>, the request is issued once
/// more; on that retry the body of an HTTP error response is accepted as well and the text
/// is decoded as GB2312.
/// </remarks>
/// <param name="url">Absolute HTTP or HTTPS address of the page to download.</param>
/// <returns>
/// The response body, or an empty string when the retry fails without any HTTP response
/// (for example, the host could not be reached).
/// </returns>
private string GetHttpWebRequest(string url)
{
    try
    {
        using (HttpWebResponse response = (HttpWebResponse)CreateCrawlerRequest(url).GetResponse())
        {
            return ReadResponseBody(response, "utf-8");
        }
    }
    catch (WebException)
    {
        return DownloadWithFallbackEncoding(url);
    }
    catch (IOException)
    {
        return DownloadWithFallbackEncoding(url);
    }
}

/// <summary>
/// Retries the download, accepting an HTTP error response, and decodes the body as GB2312.
/// </summary>
private static string DownloadWithFallbackEncoding(string url)
{
    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)CreateCrawlerRequest(url).GetResponse();
    }
    catch (WebException ex)
    {
        // Error statuses (404, 500, ...) still carry a body worth returning.
        // Response is null when the failure happened before any HTTP reply arrived.
        response = ex.Response as HttpWebResponse;
    }

    if (response == null)
    {
        return string.Empty;
    }

    using (response)
    {
        return ReadResponseBody(response, "gb2312");
    }
}

/// <summary>
/// Builds a request carrying the browser-like headers used by the crawler.
/// The headers are set before the request is sent so that they actually reach the server.
/// </summary>
private static HttpWebRequest CreateCrawlerRequest(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // The value must not repeat the header name; the closing parenthesis is part of the token.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    return request;
}

/// <summary>
/// Reads the whole response body using the named text encoding and releases the stream.
/// </summary>
private static string ReadResponseBody(HttpWebResponse response, string encodingName)
{
    using (Stream body = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding(encodingName)))
    {
        return reader.ReadToEnd();
    }
}

這是根據url爬取網頁源碼的方法,有一些小改動。很多網頁有不同的編碼格式,甚至有些網站做了反爬取的防範,這個方法經過改動後也能爬取

C#網絡爬蟲,C#爬取工具

以下是爬取網頁所有的網址鏈接

 /// <summary>
 /// Extracts the distinct link addresses found in the anchor tags of an HTML document.
 /// </summary>
 /// <remarks>
 /// Root-relative links (<c>href="/..."</c>, <c>href='/...'</c>, <c>href=/...</c>) and
 /// <c>href="./..."</c> links are prefixed with the site root taken from
 /// <paramref name="url"/>. Anchors that still contain no absolute address are prefixed with
 /// the directory part of <paramref name="url"/>, unless the tag mentions "javascript".
 /// </remarks>
 /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
 /// <param name="url">Address the HTML was downloaded from, used to resolve relative links.</param>
 /// <returns>The links in order of first appearance, without duplicates.</returns>
 private static List<string> GetHyperLinks(string htmlCode, string url)
 {
     const string SiteRootPattern = @"http(s)?://([\w-]+\.)+[\w-]+/?";
     const string AnchorPattern = @"<[aA][^>]* href=[^>]*>";
     const string AbsoluteUrlPattern = @"[a-zA-Z]+://[^\s]*";

     List<string> links = new List<string>();
     if (string.IsNullOrEmpty(htmlCode))
     {
         return links;
     }

     url = url ?? string.Empty;

     // Scheme + host of the page. It must end with '/' because the replacements below
     // consume the leading '/' of every root-relative href.
     string siteRoot = Regex.Match(url, SiteRootPattern).Value;
     if (siteRoot.Length > 0 && !siteRoot.EndsWith("/", StringComparison.Ordinal))
     {
         siteRoot += "/";
     }

     // Directory of the page. When the only slashes belong to "://" the URL has no path,
     // so the directory is the URL itself plus '/'.
     int schemeEnd = url.IndexOf("://", StringComparison.Ordinal);
     int lastSlash = url.LastIndexOf('/');
     string pageDirectory = (schemeEnd >= 0 && lastSlash < schemeEnd + 3)
         ? url + "/"
         : url.Substring(0, lastSlash + 1);

     string rootedHtml = htmlCode
         .Replace("href=\"/", "href=\"" + siteRoot)
         .Replace("href='/", "href='" + siteRoot)
         .Replace("href=/", "href=" + siteRoot)
         .Replace("href=\"./", "href=\"" + siteRoot);

     // Hash set gives constant-time duplicate detection; the list keeps discovery order.
     HashSet<string> seen = new HashSet<string>();

     foreach (Match anchor in Regex.Matches(rootedHtml, AnchorPattern, RegexOptions.Singleline))
     {
         string tag = anchor.Value;
         MatchCollection addresses = Regex.Matches(tag, AbsoluteUrlPattern, RegexOptions.Singleline);

         if (addresses.Count == 0)
         {
             if (tag.IndexOf("javascript", StringComparison.Ordinal) != -1)
             {
                 continue;
             }

             // Page-relative link: anchor it to the directory of the current page.
             tag = tag
                 .Replace("href=\"", "href=\"" + pageDirectory)
                 .Replace("href='", "href='" + pageDirectory);
             addresses = Regex.Matches(tag, AbsoluteUrlPattern, RegexOptions.Singleline);
         }

         foreach (Match address in addresses)
         {
             // The URL pattern runs up to the next whitespace, so it can swallow the
             // closing quote, '>' or ';' of the tag; strip those characters.
             string link = address.Value
                 .Replace("\"", "")
                 .Replace("'", "")
                 .Replace(">", "")
                 .Replace(";", "");

             if (seen.Add(link))
             {
                 links.Add(link);
             }
         }
     }

     return links;
 }

這塊的技術其實就是簡單的使用了正則去匹配!接下來獻上獲取標題,以及存儲到xml文件的方法

/// <summary>
/// Writes the given links to the XML file <c>D:/HyperLinks.xml</c>.
/// </summary>
/// <remarks>
/// Each link becomes one element whose name is the value returned by
/// <see cref="GetDomain"/> for that link and whose text is the link itself.
/// </remarks>
/// <param name="strURL">Address the links were extracted from; recorded in an XML comment.</param>
/// <param name="alHyperLinks">Links to store; null is treated as an empty list.</param>
private static void WriteToXml(string strURL, List<string> alHyperLinks)
{
    // The using block closes the file even when one of the writes throws.
    using (XmlTextWriter writer = new XmlTextWriter(@"D:/HyperLinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strURL + "的超鏈接");
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

        if (alHyperLinks != null)
        {
            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}

/// <summary>
/// Gets the domain suffix of an address.
/// </summary>
/// <param name="strURL">Address to inspect.</param>
/// <returns>
/// The first of "com", "net", "cn", "org" or "gov" (in the letter case used by the address)
/// that is followed by '/', ':', '?', '#' or the end of the string; "other" when none is
/// found or the address is null or empty.
/// </returns>
private static string GetDomain(string strURL)
{
    if (string.IsNullOrEmpty(strURL))
    {
        return "other";
    }

    // The look-ahead also accepts hosts that are not followed by a slash
    // ("http://example.com", "http://example.com:8080").
    Match m = Regex.Match(strURL, @"\.(com|net|cn|org|gov)(?=[/:?#]|$)", RegexOptions.IgnoreCase);
    return m.Success ? m.Groups[1].Value : "other";
}

/// <summary>
/// Gets the title of an HTML page.
/// </summary>
/// <remarks>
/// The text of the <c>&lt;title&gt;</c> element is used. When the first <c>&lt;h1&gt;</c>
/// element is non-empty and the title starts with its text, the <c>&lt;h1&gt;</c> text is
/// returned instead, because the heading of the article body is usually cleaner than the
/// page title.
/// </remarks>
/// <param name="html">HTML source of the page.</param>
/// <returns>The title text, or an empty string when the page has no title element.</returns>
private static string GetTitle(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    string titleFilter = @"<title>[\s\S]*?</title>";
    string h1Filter = @"<h1.*?>.*?</h1>";
    string clearFilter = @"<.*?>";

    string title = "";
    Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
    }

    // The heading of the article body is usually in h1 and is cleaner than the title text.
    match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "");
        if (!String.IsNullOrEmpty(h1) && title.StartsWith(h1, StringComparison.Ordinal))
        {
            title = h1;
        }
    }

    return title;
}

這就是所用的全部方法,還是有很多需要改進之處!大家如果有發現不足之處還請指出,謝謝!

以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支持VEVB武林網。


注:相關教程知識閱讀請移步到c#教程頻道。
發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 大方县| 杭锦后旗| 黔东| 衡水市| 浦东新区| 应用必备| 云梦县| 黄山市| 通城县| 江孜县| 增城市| 九江县| 县级市| 太原市| 石门县| 文成县| 新疆| 瓦房店市| 麻栗坡县| 柳河县| 广东省| 仁寿县| 贵德县| 深州市| 新巴尔虎右旗| 雷波县| 岑巩县| 鲁甸县| 大竹县| 洪江市| 揭东县| 新安县| 蒙城县| 双辽市| 收藏| 海南省| 东山县| 云阳县| 宽甸| 定襄县| 临武县|