国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 學院 > 開發設計 > 正文

C#版采集程序源碼

2019-11-17 04:17:11
字體:
來源:轉載
供稿:網友

因為工作需要,自己寫了一個采集程序。如果冒犯了你的網站,我在這里說一聲對不起!
哎~!我只是一個普通的程序員。
namespace CJ
{
    public partial class Form1 : Form
    {
        public int PRoxy = 0;
        public int keyi = 0;
        public int keyj = 0;
        public int keym = 0;
        public int keyn = 0;
        public int sum = 0;

        public string newurl = "";
        public string cururl = "";
        public string dirname = "";
        public string curdir = "";

        public string responseFromServer = "";
        public string filename = "";
        public string sql = "";
        public string mulu = "";

        StringBuilder sbs = new StringBuilder();
        List<Class1> cls = new List<Class1>();
        public ArrayList al = new ArrayList();

        public string insertdl = "insert into mzinedl values(";
        public string insertxl = "insert into mzinexl values(";
        public string insertinfo = "insert into mzineinfo values(";
        public string insertwz = "insert into mzinewz values(";

        /// <summary>
        /// Creates the form. Calls InitializeComponent, which is not defined in this
        /// part of the partial class.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        ///  保存網頁
        /// </summary>
        /// <param name="FILE_NAME">文件的路徑</param>
        /// <param name="data">數據</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }
            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
                sw.Close();
            }
        }
        /// <summary>
        ///  下載文件
        /// </summary>
        /// <param name="PageUrl">網址</param>
        /// <param name="filename">保存文件路徑</param>
        public void DownFile(string PageUrl, string filename)
        {
            if (!Directory.Exists(filename))
            {
                Directory.CreateDirectory(filename);
            }
            string  path = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
            string dirname = filename + "http://" + path;


            if (File.Exists(dirname))
            {
                return;
            }
            else
            {
                try
                {                   
                    WebClient wc = new WebClient();
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    wc.Proxy = wp;
                    wc.DownloadFile(PageUrl, dirname);
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ConnectFailure)
                    {
                        //無法連接到遠程服務器, --換代理 ip
                        //MessageBox.Show(ex.ToString());
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        DownFile(PageUrl, filename);
                    }
                    else if (ex.Status == WebExceptionStatus.Timeout)
                    {
                        //超時 --換代理 IP
                        //MessageBox.Show(ex.ToString());
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        DownFile(PageUrl, filename);
                    }
                    else if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        //文件未找到--跳出
                        //MessageBox.Show(ex.ToString());
                        return;
                    }
                }
            }

        }
        /// <summary>
        /// 讀文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路徑</param>
        /// <returns>數據</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {   
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                String input;
                while ((input = sr.ReadLine()) != null)
                {
                    al.Add(input);
                }         
                sr.Close();
            }
            return al;

        }
        /// <summary>
        /// 數據庫
        /// </summary>
        public void Executesql()
        {
            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
        }
        /// <summary>
        /// 讀文件
        /// </summary>
        /// <param name="FILE_NAME">文件的路徑</param>
        /// <returns>數據</returns>
        public string FileToText(string FILE_NAME)
        {
            string data;
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                data=sr.ReadToEnd();
                sr.Close();
            }
            return data;
        }
        /// <summary>
        /// 保存SQL
        /// </summary>
        /// <param name="sql"></param>
        public void SaveSqls(string sql)
        {
            sbs.Append(sql).Append("/n");
        }   
        /// <summary>
        ///  請求失敗的時候,反復操作
        /// </summary>
        /// <param name="PageUrl"></param>
        /// <returns></returns>
        public string ToServer(string PageUrl)
        {
            string responseFromServer = "";
         
            try
            {               
                while (1 == 1)
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    WebProxy wp = new WebProxy(al[proxy].ToString(), true);
                    request.Proxy = wp;
                    request.Timeout = 1000 * 60;

                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    Stream dataStream = response.GetResponseStream();
                    StreamReader reader=null;
                    try
                    {
                        reader = new StreamReader(dataStream, System.Text.Encoding.Default);
                        responseFromServer = reader.ReadToEnd();
                    }
                    catch
                    {
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        ToServer(PageUrl);
                    };
                    reader.Close();
                    dataStream.Close();
                    response.Close();
                    if (responseFromServer.Contains("refresh") || responseFromServer == "")
                    {
                        proxy++;
                        if (proxy >= al.Count)
                        {
                            al = ReadIPproxy("e://test.txt");//初始化代理   IP
                        }
                        //ToServer(PageUrl);
                    }
                    else
                    {
                        break;
                    }
                }
            }
            catch (WebException ex)
            {               
                if (ex.Status == WebExceptionStatus.ProtocolError)
                {                  
                    responseFromServer = "";
                }
                else
                {
                    proxy++;
                    if (proxy >= al.Count)
                    {
                        al = ReadIPproxy("e://test.txt");//初始化代理   IP
                    }
                    ToServer(PageUrl);
                }
            }          
            return responseFromServer;
        }
        /// <summary>
        /// 保存xml 文件
        /// </summary>
        public void SaveXmls()
        {
            string pathxml = "";
            foreach (Class1 c in cls)
            {
                Class1 s = c;
                pathxml = s.address;

                if (!File.Exists(pathxml))
                {
                    XmlSerializer xs = new XmlSerializer(typeof(Class1));
                    Stream stream = new FileStream(pathxml, FileMode.Create, Fileaccess.Write, FileShare.ReadWrite);
                    xs.Serialize(stream, s);
                    stream.Close();
                }            
              
            }
        }
        /// <summary>
        ///  移除HTMl 標記
        /// </summary>
        /// <param name="Html"></param>
        /// <param name="RegStr"></param>
        /// <returns></returns>
        public static string Remove(string Html)
        {
            //Regex Reg = new Regex(RegStr);
            //foreach (Match m in Reg.Matches(Html))
            //{
            //    Html = Html.Replace(m.Value, "");
            //}
            //return Html.Trim();
            string regesstr = "<.*?>";
            return Regex.Replace(Html, regesstr, string.Empty, RegexOptions.IgnoreCase);
        }
        public static string FilterScript(string content)
        {
            string regexstr = @"<(script)[^>]*>(/s*|.)*<//1>";
            return Regex.Replace(content,regexstr,string.Empty,RegexOptions.IgnoreCase);

        }
        /// <summary>
        /// 過略所有的 危險標記
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        public string wipeScript(string html)
        {
            System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"(<script){1,}[^<>]*>[^/0]*(<//script>){1,}", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@"href   *=   *[/s/S]*script   *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@"on[/s/S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[/s/S]+</iframe*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[/s/S]+</frameset*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            html = regex1.Replace(html, "");   //過濾<script></script>標記  
            html = regex2.Replace(html, "");   //過濾href=javascript:   (<A>)   屬性  
            html = regex3.Replace(html, "   _disibledevent=");   //過濾其它控件的on...事件  
            html = regex4.Replace(html, "");   //過濾iframe  
            html = regex5.Replace(html, "");   //過濾frameset  
            return html;
        }
        public void HtmlSource(string urlpri)
        {
            //要寫入的文件路徑
            filename = "E://觀2//magazine.html";

            if (!Directory.Exists("E://觀2"))
            {
                Directory.CreateDirectory("E://觀2");
            }
            if (File.Exists(filename))
            {
                responseFromServer=FileToText(filename); //存在
               
            }
            else
            {
                responseFromServer = ToServer(urlpri); //不存在
               
            }
            sum++;
            if (responseFromServer != "")
            {
                //分析內容
                TextToFile(filename,responseFromServer);

                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;

                    int key = ++keyi;
                    sql = insertdl + key + ",'" + dirname + "')";
                    SaveSqls(sql);

                    cururl = urlpri + newurl;
                    curdir = "E://觀2//" + dirname;

                    one(cururl, curdir,key);
                }
                SaveXmls();
                Executesql();               
               
                this.textBox1.Text = sum.ToString();
                MessageBox.Show("采集成功!");
            }
        }
        public void one(string urlpri,string _dirname,int _key)
        {
            //要寫入的文件路徑
            filename = _dirname +"http://"+ urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                MatchCollection mc = Regex.Matches(responseFromServer, @"href=""/././(.*list.html)""[/s/S]*?《(.*?)》", RegexOptions.IgnoreCase);

                foreach (Match m in mc)
                {
                    newurl = m.Groups[1].Value;
                    dirname = m.Groups[2].Value;

                    cururl = "                    curdir = _dirname + "http://" + dirname;

                    two(cururl, curdir, _key);

                }                              
            }          
        }
        public void two(string urlpri,string _dirname,int _key)
        {
            filename = urlpri.Substring(0, urlpri.LastIndexOf("/"));
            filename = filename.Substring(filename.LastIndexOf("/") + 1) + ".html";
            filename = _dirname + "http://" + filename;

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }

            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                Match mc = Regex.Match(responseFromServer, @"刊/s+期:(.*?)<br>[/s/S]*?編/s+輯:(.*?)<br>[/s/S]*?出/s+版: (.*?)<br>[/s/S]*?聯系電話:(.*?)<br>[/s/S]*?E-mail: (.*?)<br>[/s/S]*?社/s+址:(.*?)<br>[/s/S]*?郵/s+編: (.*?)<br>[/s/S]*?郵發代號:(.*?)<br>[/s/S]*?國外發行代號: (.*?)<br>[/s/S]*?國際標準刊號:(.*?)<br>[/s/S]*?國內統一刊號: (.*?)</td>", RegexOptions.IgnoreCase);
                Match content = Regex.Match(responseFromServer, @"刊/s+物/s+簡/s+介/s+:::...([/s/S]*?)...:::/s+收錄期號列表", RegexOptions.Multiline);
                int key = ++keyj;
                sql = insertxl + keyj + "," + _key + ",'" + dirname + "','" + mc.Groups[1].Value + "','" + mc.Groups[2].Value + "','" +
                       mc.Groups[3].Value + "','" + mc.Groups[4].Value + "','" + mc.Groups[5].Value + "','" + mc.Groups[6].Value + "','" +
                       mc.Groups[7].Value + "','" + mc.Groups[8].Value + "','" + mc.Groups[9].Value + "','" + mc.Groups[10].Value + "','" + mc.Groups[11].Value + "','" + Remove(content.Groups[1].Value) + "')";
                SaveSqls(sql);

                MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(.*?)'/s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc2)
                {
                    newurl = m2.Groups[1].Value;
                    dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");

                    cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
                    curdir = _dirname + "http://" + dirname;

                    three(cururl, curdir,key,dirname);
                }
               
            }          
        }
        public void three(string urlpri,string _dirname,int _key,string qishu)
        {
            //要寫入的文件路徑
            filename = _dirname + "http://" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }

            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                Match m = Regex.Match(responseFromServer, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
                string photoName = "";
                if (m.Groups[1].Value.Trim() != "")
                {
                    photoName = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) +"face_" + m.Groups[1].Value;
                    DownFile(photoName, _dirname);
                  
                }
                int key = ++keym;
                sql = insertinfo + key + "," + _key + ",'" + qishu + "','" + _dirname +"http://"+ "face_" + m.Groups[1].Value + "')";
                SaveSqls(sql);

                MatchCollection mc2 = Regex.Matches(responseFromServer, @"href='(/d+.html?)'[/s/S]*?<font/s+color=black>(.*?)</a>|& lt;font[^>]*?>[(.+?)]", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc2)
                {
                    newurl = m2.Groups[1].Value;

                    string muName = m2.Groups[3].Value;
                    if (muName == "")
                    {
                        muName = mulu;
                    }
                    string lstr = m2.Groups[2].Value;
                    string s1 = "";
                    string s2 = "";
                    if (lstr != "")
                    {
                        if (lstr.Contains("."))
                        {
                            s1 = lstr.Substring(0, lstr.IndexOf("."));
                            s2 = lstr.Substring(lstr.LastIndexOf(".") + 1);
                        }
                        else
                        {
                            s1 = lstr;
                            s2 = "";
                        }
                        int k2 = ++keyn;
                        sql = insertwz + k2 + "," + key + ",'" + muName + "','" + s1 + "','" + s2 + "')";
                        SaveSqls(sql);

                        cururl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1) + newurl;
                        curdir = _dirname;
                        four(cururl, curdir,k2);

                    }
                    mulu = muName;
                }              
            }
        }      
        public void four(string urlpri,string _dirname,int _key)
        {
            filename = _dirname + "http://" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);

            if (!Directory.Exists(_dirname))
            {
                Directory.CreateDirectory(_dirname);
            }
            if (File.Exists(filename))
            {
                responseFromServer = FileToText(filename);
            }
            else
            {
                responseFromServer = ToServer(urlpri);
            }
            sum++;
            if (responseFromServer != "")
            {
                TextToFile(filename, responseFromServer);

                //分析內容
                Match m = Regex.Match(responseFromServer, @"正文開始-->(?<text>[/s/S]*?)<!--正文結束", RegexOptions.IgnoreCase);
                string content = m.Groups["text"].Value; //得到正文的所有內容
                string c = FilterScript(content);
                c = Remove(c);  //得到過濾后的正文內容
               // Match ms = Regex.Match(c, @"正文開始-->(?<text>[/s/S]*?)<!--正文結束", RegexOptions.IgnoreCase);
              
              
                //設置要保存的XML 文件的名稱
                string xmlname = urlpri.Substring(urlpri.LastIndexOf("/") + 1, urlpri.LastIndexOf(".") - urlpri.LastIndexOf("/"));
                string pathxml = _dirname + "http://" + xmlname + "xml";  //將路徑 和名字一起傳過去

                Class1 cs = new Class1(_key, c, pathxml);
                cls.Add(cs);
                //序列化成功
                MatchCollection mc = Regex.Matches(responseFromServer, @"(<img/s+src=""(?<imgs>.*)""/s+hspace|HreF=""([^>]*PDF)"")", RegexOptions.IgnoreCase);
                foreach (Match m2 in mc)
                {
                    string imgurl = m2.Groups["imgs"].Value.Trim(); //得到單個圖片的名稱
                    string zhuurl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
                    if (imgurl != "")
                    {
                        string jurl = zhuurl + imgurl; //得到圖片的絕對路徑
                        DownFile(jurl, _dirname);
       
                    }
                    string pdfurl = m2.Groups["pdfs"].Value.Trim(); //得到單個PDF 的名稱
                    if (pdfurl != "")
                    {
                        string jurl = zhuurl + pdfurl; //得到 pdf 的絕對路徑  
                        DownFile(jurl, _dirname);
                                         
                    }
                }
            }
        }  
        // Click handler for the OK button: loads the proxy list, then starts the crawl.
        private void btnOK_Click(object sender, EventArgs e)
        {
            al = ReadIPproxy("e://test.txt");// initialize the proxy IP list
            // NOTE(review): the start URL argument was lost from the published listing; the call
            // below is incomplete and must be restored before this file compiles.
            HtmlSource("
        }

        private void button1_Click(object sender, EventArgs e)
        {
           
application.Exit();           
        }
      
    }
}


發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 乐昌市| 红原县| 长顺县| 岗巴县| 香港| 龙口市| 恩施市| 额尔古纳市| 申扎县| 唐山市| 曲阜市| 清河县| 杨浦区| 景泰县| 延安市| 洪湖市| 樟树市| 长阳| 扶风县| 象州县| 定远县| 自贡市| 奇台县| 乌审旗| 浙江省| 逊克县| 永州市| 富裕县| 江都市| 芜湖市| 双辽市| 南京市| 海淀区| 年辖:市辖区| 慈溪市| 綦江县| 那坡县| 韩城市| 自治县| 隆林| 区。|