zoukankan      html  css  js  c++  java
  • 自己用C#写一个采集器、蜘蛛

    using System;
    
    using System.Collections.Generic;
    
    using System.Text;
    
    using System.Net;
    
    using System.Web;
    
    using System.IO;
    
    using System.Collections;
    
    using System.Text.RegularExpressions;
    
    
    
    
    
    namespace chinaz
    
    {
    
        /// <summary>
        /// Console crawler that walks a numbered range of pages and collects the
        /// e-mail addresses found in them.
        /// </summary>
        class Program
        {
            /// <summary>
            /// E-mail address pattern. The <c>\w</c> escapes and the escaped dot are
            /// essential: without the backslashes the pattern only matches literal
            /// runs of the letter "w" and never finds an address.
            /// </summary>
            private static readonly Regex EmailPattern = new Regex(
                @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            /// <summary>
            /// Reads the cookie text from "cookie.txt", then reads four lines from
            /// standard input (first page number, last page number, URL template,
            /// encoding name), downloads every page in the range and appends each
            /// distinct e-mail address to "mail.txt" while echoing it to the console.
            /// </summary>
            /// <param name="args">Command-line arguments (not used).</param>
            static void Main(string[] args)
            {
                string cookie;
                using (StreamReader sr = new StreamReader("cookie.txt"))
                {
                    // The using block disposes the reader; no explicit Close() is needed.
                    cookie = sr.ReadToEnd();
                }

                int a = int.Parse(Console.ReadLine());
                int b = int.Parse(Console.ReadLine());

                // URL template passed to string.Format; "{0}" is replaced by the page number,
                // e.g. http://bbs.chinaz.com/Members.html?page={0}&sort=CreateDate&desc=true&keyword=
                string url = Console.ReadLine();

                // Addresses already written, so each one is stored only once per run.
                HashSet<string> seen = new HashSet<string>();
                Encoding encoding = Encoding.GetEncoding(Console.ReadLine());

                for (int i = a; i <= b; i++)
                {
                    string html = SRWebClient.GetPage(string.Format(url, i), encoding, cookie);

                    // Pages of 1000 characters or fewer are skipped.
                    if (html == null || html.Length <= 1000)
                    {
                        continue;
                    }

                    foreach (Match m in EmailPattern.Matches(html))
                    {
                        // Add returns false when the address was already recorded.
                        if (!seen.Add(m.Value))
                        {
                            continue;
                        }

                        Console.WriteLine(m.Value);

                        // Append immediately so results survive an interrupted run.
                        File.AppendAllText("mail.txt", m.Value + Environment.NewLine);
                    }
                }

                Console.Write("完成");
                Console.ReadLine();
            }
        }
    
    
    
    
    
        public class SRWebClient
    
        {
    
            public CookieCollection cookie;
    
            /// <summary>
            /// Creates a client that starts without any stored cookies.
            /// </summary>
            public SRWebClient()
            {
                this.cookie = null;
            }
    
    
    
            #region Read a string containing several cookies into a CookieCollection

            /// <summary>
            /// Splits <paramref name="cookieHead"/> on ';' and adds every segment that
            /// <c>GetCookieFromString</c> can turn into a cookie to
            /// <paramref name="cookieCol"/>.
            /// </summary>
            /// <param name="cookieCol">Target collection; a new one is created when it is null.</param>
            /// <param name="cookieHead">Cookie header text; when null nothing is added.</param>
            /// <param name="defaultDomain">Domain handed to <c>GetCookieFromString</c> for each segment.</param>
            private static void AddCookieWithCookieHead(ref CookieCollection cookieCol, string cookieHead, string defaultDomain)
            {
                if (cookieCol == null)
                {
                    cookieCol = new CookieCollection();
                }

                if (cookieHead == null)
                {
                    return;
                }

                foreach (string segment in cookieHead.Split(';'))
                {
                    Cookie parsed = GetCookieFromString(segment.Trim(), defaultDomain);
                    if (parsed == null)
                    {
                        // Segment did not yield a named cookie.
                        continue;
                    }

                    cookieCol.Add(parsed);
                }
            }

            #endregion
    
    
    
            #region Read a single cookie string into a Cookie

            /// <summary>
            /// Builds a <see cref="Cookie"/> from a comma-separated list of
            /// <c>key=value</c> pairs.
            /// </summary>
            /// <remarks>
            /// The keys "path", "domain" and "expires" are recognised regardless of
            /// letter case; "expires" is ignored. Any other key becomes the cookie's
            /// name and its value the cookie's value; when several such pairs are
            /// present the last one in the string wins. Parts without '=' (or starting
            /// with '=') are skipped.
            /// </remarks>
            /// <param name="cookieString">Text to parse; null or empty yields null.</param>
            /// <param name="defaultDomain">Domain assigned when the text carries no domain.</param>
            /// <returns>The parsed cookie, or null when no cookie name was found.</returns>
            /// <exception cref="CookieException">The parsed name is rejected by <see cref="Cookie.Name"/>.</exception>
            private static Cookie GetCookieFromString(string cookieString, string defaultDomain)
            {
                if (string.IsNullOrEmpty(cookieString))
                {
                    return null;
                }

                Cookie ck = new Cookie();

                // Pairs are applied in input order, so the outcome is deterministic and
                // repeated keys simply overwrite each other instead of throwing.
                foreach (string part in cookieString.Split(','))
                {
                    string s = part.Trim();
                    int index = s.IndexOf('=');
                    if (index <= 0)
                    {
                        continue;
                    }

                    string key = s.Substring(0, index);
                    string value = s.Substring(index + 1);

                    if (string.Equals(key, "path", StringComparison.OrdinalIgnoreCase))
                    {
                        ck.Path = value;
                    }
                    else if (string.Equals(key, "expires", StringComparison.OrdinalIgnoreCase))
                    {
                        // Expiry is intentionally not parsed.
                    }
                    else if (string.Equals(key, "domain", StringComparison.OrdinalIgnoreCase))
                    {
                        ck.Domain = value;
                    }
                    else
                    {
                        ck.Name = key;
                        ck.Value = value;
                    }
                }

                if (ck.Name == "") return null;

                if (ck.Domain == "") ck.Domain = defaultDomain;

                return ck;
            }

            #endregion
    
    
    
    
    
    
    
            /// <summary>
            /// Downloads the body of <paramref name="URL"/> as text, decoded with
            /// <see cref="Encoding.Default"/>. Redirects are not followed and the
            /// request times out after 20 seconds.
            /// </summary>
            /// <TgData>
            ///     <Alias>Download web page source</Alias>
            /// </TgData>
            /// <param name="URL">Address to download.</param>
            /// <param name="CreateCookie">
            /// When true, the cookies returned with the response replace the cookies
            /// stored on this client.
            /// </param>
            /// <returns>The response text, or an empty string when the download fails.</returns>
            public string DownloadHtml(string URL, bool CreateCookie)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);

                    // A container must always be attached: HttpWebResponse.Cookies is only
                    // filled in when the request has one, so without it CreateCookie would
                    // never capture anything on the first request.
                    request.CookieContainer = new CookieContainer();
                    if (cookie != null)
                    {
                        request.CookieContainer.Add(cookie);
                    }

                    request.AllowAutoRedirect = false;
                    request.Timeout = 20000;

                    using (HttpWebResponse res = (HttpWebResponse)request.GetResponse())
                    using (StreamReader reader = new StreamReader(res.GetResponseStream(), Encoding.Default))
                    {
                        string html = reader.ReadToEnd();

                        // Only adopt the cookies once the body was read successfully.
                        if (CreateCookie)
                        {
                            cookie = res.Cookies;
                        }

                        return html;
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report the cause and fall through to the empty result.
                    Console.Error.WriteLine("DownloadHtml failed for {0}: {1}", URL, ex.Message);
                }

                return string.Empty;
            }
    
    
    
            /// <summary>
            /// Downloads <paramref name="FileURL"/> and writes the response body to
            /// <paramref name="FileSavePath"/>. Redirects are followed.
            /// </summary>
            /// <TgData>
            ///     <Alias>Download file</Alias>
            /// </TgData>
            /// <param name="FileURL">Address of the file to download.</param>
            /// <param name="FileSavePath">Path of the file to create (overwritten if present).</param>
            /// <param name="CreateCookie">
            /// When true, the cookies returned with the response replace the cookies
            /// stored on this client.
            /// </param>
            /// <returns>
            /// The response's <c>ContentLength</c>. Errors while copying the body are
            /// reported on the error stream but do not change the return value.
            /// </returns>
            /// <remarks>
            /// Failures while creating the request or obtaining the response are not
            /// caught and propagate to the caller.
            /// </remarks>
            public long DownloadFile(string FileURL, string FileSavePath, bool CreateCookie)
            {
                long Filelength = 0;
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(FileURL);

                // A container must always be attached: HttpWebResponse.Cookies is only
                // filled in when the request has one.
                req.CookieContainer = new CookieContainer();
                if (cookie != null)
                {
                    req.CookieContainer.Add(cookie);
                }

                req.AllowAutoRedirect = true;

                using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
                {
                    if (CreateCookie)
                    {
                        cookie = res.Cookies;
                    }

                    using (Stream stream = res.GetResponseStream())
                    {
                        try
                        {
                            Filelength = res.ContentLength;

                            byte[] buffer = new byte[512];

                            // Read the first chunk before creating the file so that a
                            // failing read does not leave an empty file behind.
                            int nReadSize = stream.Read(buffer, 0, buffer.Length);

                            using (FileStream fs = File.Create(FileSavePath))
                            {
                                while (nReadSize > 0)
                                {
                                    fs.Write(buffer, 0, nReadSize);
                                    nReadSize = stream.Read(buffer, 0, buffer.Length);
                                }
                            }
                        }
                        catch (Exception er)
                        {
                            // Best effort: report the cause instead of hiding it.
                            Console.Error.WriteLine("DownloadFile failed for {0}: {1}", FileURL, er.Message);
                        }
                    }
                }

                return Filelength;
            }
    
    
    
            /// <summary>
            /// Sends a request to <paramref name="RequestPageURL"/> and returns the
            /// response text decoded with <see cref="Encoding.Default"/>. When
            /// <paramref name="Data"/> produces a non-empty string it is sent as a
            /// form-urlencoded POST body; otherwise the request method is left at its
            /// default. Redirects are not followed and the timeout is 20 seconds.
            /// </summary>
            /// <TgData>
            ///     <Alias>Submit data</Alias>
            /// </TgData>
            /// <param name="RequestPageURL">Address to request; also sent as the Referer.</param>
            /// <param name="Data">Fields to submit.</param>
            /// <param name="CreateCookie">
            /// When true, the response's Set-Cookie header is parsed and added to the
            /// cookies stored on this client.
            /// </param>
            /// <returns>The response text, or an empty string when anything fails.</returns>
            public string Request(string RequestPageURL, RequestData Data, bool CreateCookie)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(RequestPageURL);

                    string postdata = Data.GetData();
                    request.Referer = RequestPageURL;
                    request.AllowAutoRedirect = false;
                    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; Maxthon; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
                    request.Timeout = 20000;

                    if (cookie != null)
                    {
                        request.CookieContainer = new CookieContainer();
                        request.CookieContainer.Add(cookie);
                    }

                    // Data to submit is present, so use POST.
                    if (postdata.Length > 0)
                    {
                        request.ContentType = "application/x-www-form-urlencoded"; // sent as a form
                        request.Method = "POST";

                        // Convert the submitted data to bytes.
                        byte[] payload = Encoding.UTF8.GetBytes(postdata);
                        request.ContentLength = payload.Length;

                        // using guarantees the request stream is released even if Write throws.
                        using (Stream body = request.GetRequestStream())
                        {
                            body.Write(payload, 0, payload.Length);
                        }
                    }

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (CreateCookie)
                        {
                            AddCookieWithCookieHead(ref cookie, response.Headers["Set-Cookie"], request.RequestUri.Host);
                        }

                        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report the cause and fall through to the empty result.
                    Console.Error.WriteLine("Request to {0} failed: {1}", RequestPageURL, ex.Message);
                }

                return string.Empty;
            }
    
    
    
    
    
            /// <summary>
            /// Posts <paramref name="Data"/> to
            /// http://www.imobile.com.cn/wapdiyringdownload.php and saves the response
            /// body under "f:/mobileMusic/", in a subdirectory named after a random
            /// number below 100, using the file name from the response's
            /// Content-Disposition header.
            /// </summary>
            /// <param name="Data">Fields to submit.</param>
            /// <param name="file">
            /// Receives "&lt;number&gt;/&lt;file name&gt;" relative to "f:/mobileMusic/" once
            /// the name is known; null when the name could not be determined.
            /// </param>
            /// <returns>True when the file was written completely; false on any failure.</returns>
            public bool PostDownload(RequestData Data, out string file)
            {
                const string StrUrl = "http://www.imobile.com.cn/wapdiyringdownload.php";
                const string FileNameMarker = "filename=";

                file = null;

                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(StrUrl);

                    string postdata = Data.GetData();
                    request.Referer = StrUrl;
                    request.AllowAutoRedirect = false;
                    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; Maxthon; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
                    request.Timeout = 20000;

                    if (cookie != null)
                    {
                        request.CookieContainer = new CookieContainer();
                        request.CookieContainer.Add(cookie);
                    }

                    // Data to submit is present, so use POST.
                    if (postdata.Length > 0)
                    {
                        request.ContentType = "application/x-www-form-urlencoded"; // sent as a form
                        request.Method = "POST";

                        // Convert the submitted data to bytes.
                        byte[] payload = Encoding.UTF8.GetBytes(postdata);
                        request.ContentLength = payload.Length;

                        using (Stream body = request.GetRequestStream())
                        {
                            body.Write(payload, 0, payload.Length);
                        }
                    }

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        string des = response.Headers["Content-Disposition"];
                        int marker = des == null ? -1 : des.IndexOf(FileNameMarker);
                        if (marker < 0)
                        {
                            Console.Error.WriteLine("PostDownload: response carries no file name.");
                            return false;
                        }

                        // The name comes from the server: strip quotes and any directory
                        // part so it cannot point outside the download folder.
                        string name = des.Substring(marker + FileNameMarker.Length).Trim().Trim('"');
                        name = Path.GetFileName(name);
                        if (string.IsNullOrEmpty(name))
                        {
                            Console.Error.WriteLine("PostDownload: response carries an empty file name.");
                            return false;
                        }

                        file = new Random().Next(100).ToString() + "/" + name;

                        string target = "f:/mobileMusic/" + file;

                        // File.Create does not create missing directories.
                        Directory.CreateDirectory(Path.GetDirectoryName(target));

                        using (Stream stream = response.GetResponseStream())
                        using (FileStream fs = File.Create(target))
                        {
                            byte[] buffer = new byte[512];
                            int nReadSize = stream.Read(buffer, 0, buffer.Length);
                            while (nReadSize > 0)
                            {
                                fs.Write(buffer, 0, nReadSize);
                                nReadSize = stream.Read(buffer, 0, buffer.Length);
                            }
                        }

                        return true;
                    }
                }
                catch (Exception ex)
                {
                    // Report the cause and signal the failure to the caller.
                    Console.Error.WriteLine("PostDownload failed: {0}", ex.Message);
                    return false;
                }
            }
    
            #region GetPage
    
            /// <summary>
            /// Fetches the source of a page without a cookie, trying up to
            /// <paramref name="TryCount"/> times until a non-empty result is obtained.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <param name="encoding">Encoding used to decode the response.</param>
            /// <param name="TryCount">Maximum number of attempts.</param>
            /// <returns>The first non-empty page text, or an empty string when every attempt came back empty.</returns>
            public static string GetPage(string url, Encoding encoding, int TryCount)
            {
                int attempt = 0;
                while (attempt < TryCount)
                {
                    // The null argument selects the overload taking a cookie string.
                    string page = GetPage(url, encoding, (string)null);
                    if (!string.IsNullOrEmpty(page))
                    {
                        return page;
                    }

                    attempt++;
                }

                return string.Empty;
            }
    
    
    
            /// <summary>
            /// Fetches the source of a page with a single GET-style request. Redirects
            /// are not followed and the request times out after 20 seconds.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <param name="encoding">Encoding used to decode the response.</param>
            /// <param name="cookie">Value sent as the Cookie request header; null sends none.</param>
            /// <returns>
            /// The page text when the status is 200 OK and the reported content length is
            /// below 1 MiB; otherwise, or when the request fails, an empty string.
            /// </returns>
            public static string GetPage(string url, Encoding encoding, string cookie)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2;)";
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;

                    if (cookie != null)
                    {
                        request.Headers["Cookie"] = cookie;
                    }

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                        {
                            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
                            {
                                return reader.ReadToEnd();
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report why the page was skipped instead of hiding it.
                    Console.Error.WriteLine("GetPage failed for {0}: {1}", url, ex.Message);
                }

                return string.Empty;
            }
    
            #endregion
    
        }
    
    
    
        /// <summary>
        /// Collects named form fields and renders them as a single
        /// <c>name=value&amp;name=value</c> string.
        /// </summary>
        public class RequestData
        {
            // Field name -> field value; assigning an existing name replaces its value.
            Hashtable fields = new Hashtable();

            /// <summary>
            /// Creates an empty field set.
            /// </summary>
            public RequestData()
            {
            }

            /// <summary>
            /// Joins all fields with '&amp;' in the table's enumeration order. Names and
            /// values are emitted exactly as stored; no URL-encoding is applied here.
            /// </summary>
            /// <returns>The joined fields, or an empty string when no field was added.</returns>
            public string GetData()
            {
                StringBuilder joined = new StringBuilder();

                foreach (string name in fields.Keys)
                {
                    if (joined.Length > 0)
                    {
                        joined.Append("&");
                    }

                    joined.Append(name).Append("=").Append(fields[name]);
                }

                return joined.ToString();
            }

            /// <summary>
            /// Sets the value of a field, replacing any earlier value for the same name.
            /// </summary>
            /// <param name="Field">Field name.</param>
            /// <param name="Value">Field value.</param>
            public void AddField(string Field, string Value)
            {
                fields[Field] = Value;
            }
        }
    
    }
    

      

  • 相关阅读:
    TCP 的那些事儿(下)
    如何获取(GET)一杯咖啡——星巴克REST案例分析
    前端必读:浏览器内部工作原理
    伟大的程序员是怎样炼成的?
    从用户行为打造活动交互设计闭环——2014年世界杯竞猜活动设计总结
    技术普及帖:你刚才在淘宝上买了一件东西
    什么是互联网思维?给你最全面的解释
    程序员生存定律-打造属于自己的稀缺性
    技术人员如何去面试?
    13幅逻辑图,领略杜克大学的经典思维
  • 原文地址:https://www.cnblogs.com/feb9903/p/3430762.html
Copyright © 2011-2022 走看看