zoukankan      html  css  js  c++  java
  • 网站抓取

    新软件马上就要完成了,先发篇文章YY下

     

    最近一直在做网站抓取方面的开发,闲来无事逛论坛时,发现有些帖子写得相当不错,只是一篇一篇地点进去比较麻烦,于是就写了个小软件,方便查看博客园和CSDN上的优秀文章。这个软件其实还可以拓展,比如添加RSS功能,或者查看新闻网站的新闻。代码比较简单,后续可以考虑用工厂模式重构。

    写得比较乱,都不敢上代码了。求大神喷!

    2013-6-28号更新

    1、添加了皮肤

    2013-6-29号更新

    1、解决了ListView控件添加数据闪烁问题。

    2、取消皮肤加快数据加载速度

    3、优化了浏览文章体验

    点击下载

     里面有几个类库非常不错,想要的可以拿去。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    272
    273
    274
    275
    276
    277
    278
    279
    280
    281
    282
    283
    284
    285
    286
    287
    288
    289
    290
    291
    292
    293
    294
    295
    296
    297
    298
    299
    300
    301
    302
    303
    304
    305
    306
    307
    308
    309
    310
    311
    312
    313
    314
    315
    316
    317
    318
    319
    320
    321
    322
    323
    324
    325
    326
    327
    328
    329
    330
    331
    332
    333
    334
    335
    336
    337
    338
    339
    340
    341
    342
    343
    344
    345
    346
    347
    348
    349
    350
    351
    352
    353
    354
    355
    356
    357
    358
    359
    360
    361
    362
    363
    364
    365
    366
    367
    368
    369
    370
    371
    372
    373
    374
    375
    376
    377
    378
    379
    380
    381
    382
    383
    384
    385
    386
    387
    388
    389
    390
    391
    392
    393
    394
    395
    396
    397
    398
    399
    400
    401
    402
    403
    404
    405
    406
    407
    408
    409
    410
    411
    412
    413
    414
    415
    416
    417
    418
    419
    420
    421
    422
    423
    424
    425
    426
    427
    428
    429
    430
    431
    432
    433
    434
    435
    436
    437
    438
    439
    440
    441
    442
    443
    444
    445
    446
    447
    448
    449
    450
    451
    452
    453
    454
    455
    456
    457
    458
    459
    460
    461
    462
    463
    464
    465
    466
    467
    468
    469
    470
    471
    472
    473
    474
    475
    476
    477
    478
    479
    480
    481
    482
    483
    484
    485
    486
    487
    488
    489
    490
    491
    492
    493
    494
    495
    496
    497
    498
    499
    500
    501
    502
    503
    504
    505
    506
    507
    508
    509
    510
    511
    512
    513
    514
    515
    516
    517
    518
    519
    520
    521
    522
    523
    524
    525
    526
    527
    528
    529
    530
    531
    532
    533
    534
    535
    536
    537
    538
    539
    540
    541
    542
    543
    544
    545
    546
    547
    548
    549
    550
    551
    552
    553
    554
    555
    556
    557
    558
    559
    560
    561
    562
    563
    564
    565
    566
    567
    568
    569
    570
    571
    572
    573
    574
    575
    576
    577
    578
    579
    580
    581
    582
    583
    584
    585
    586
    587
    588
    589
    590
    591
    592
    593
    594
    595
    596
    597
    598
    599
    600
    601
    602
    603
    604
    605
    606
    607
    608
    609
    ///
    /// 类说明:HttpHelps类,用来实现Http访问,Post或者Get方式的,直接访问,带Cookie的,带证书的等方式,可以设置代理
    /// 重要提示:请不要自行修改本类,如果因为你自己修改后将无法升级到新版本。如果确实有什么问题请到官方网站提建议,
    /// 我们一定会及时修改
    /// 编码日期:2011-09-20
    /// 编 码 人:苏飞
    /// 联系方式:361983679 
    /// 修改日期:2013-04-14
    ///
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.IO.Compression;
    using System.Security.Cryptography.X509Certificates;
    using System.Net.Security;
     
    namespace Common.PageHelper
    {
        ///
        /// Http连接操作帮助类
        ///
        public class HttpHelper
        {
            #region 预定义方法或者变更
            //默认的编码
            private Encoding encoding = Encoding.Default;
            //HttpWebRequest对象用来发起请求
            private HttpWebRequest request = null;
            //获取影响流的数据对象
            private HttpWebResponse response = null;
            ///
            /// 根据相传入的数据,得到相应页面数据
            ///
            ///参数类对象
            ///返回HttpResult类型
            private HttpResult GetHttpRequestData(HttpItem objhttpitem)
            {
                //返回参数
                HttpResult result = new HttpResult();
                try
                {
                    #region 得到请求的response
                    using (response = (HttpWebResponse)request.GetResponse())
                    {
                        result.StatusCode = response.StatusCode;
                        result.StatusDescription = response.StatusDescription;
                        result.Header = response.Headers;
                        if (response.Cookies != null)
                            result.CookieCollection = response.Cookies;
                        if (response.Headers["set-cookie"] != null)
                            result.Cookie = response.Headers["set-cookie"];
                        MemoryStream _stream = new MemoryStream();
                        //GZIIP处理
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                        {
                            //开始读取流并设置编码方式
                            //new GZipStream(response.GetResponseStream(), CompressionMode.Decompress).CopyTo(_stream, 10240);
                            //.net4.0以下写法
                            _stream = GetMemoryStream(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
                        }
                        else
                        {
                            //开始读取流并设置编码方式
                            //response.GetResponseStream().CopyTo(_stream, 10240);
                            //.net4.0以下写法
                            _stream = GetMemoryStream(response.GetResponseStream());
                        }
                        //获取Byte
                        byte[] RawResponse = _stream.ToArray();
                        _stream.Close();
                        //是否返回Byte类型数据
                        if (objhttpitem.ResultType == ResultType.Byte)
                            result.ResultByte = RawResponse;
                        //从这里开始我们要无视编码了
                        if (encoding == null)
                        {
                            Match meta = Regex.Match(Encoding.Default.GetString(RawResponse), "<meta([^<]*)charset=([^<]*)["']", RegexOptions.IgnoreCase);
                            string charter = (meta.Groups.Count > 2) ? meta.Groups[2].Value.ToLower() : string.Empty;
                            charter = charter.Replace(""", "").Replace("'", "").Replace(";", "").Replace("iso-8859-1", "gbk");
                            if (charter.Length > 2)
                                encoding = Encoding.GetEncoding(charter);
                            else
                            {
                                if (string.IsNullOrEmpty(response.CharacterSet))
                                    encoding = Encoding.UTF8;
                                else
                                    encoding = Encoding.GetEncoding(response.CharacterSet);
                            }
                        }
                        //得到返回的HTML
                        result.Html = encoding.GetString(RawResponse);
                    }
                    #endregion
                }
                catch (WebException ex)
                {
                    //这里是在发生异常时返回的错误信息
                    response = (HttpWebResponse)ex.Response;
                    result.Html = ex.Message;
                    result.StatusCode = response.StatusCode;
                    result.StatusDescription = response.StatusDescription;
                }
                catch (Exception ex)
                {
                    result.Html = ex.Message;
                }
                if (objhttpitem.IsToLower)
                    result.Html = result.Html.ToLower();
     
                return result;
            }
            ///
            /// 4.0以下.net版本取数据使用
            ///
            ///流
            private static MemoryStream GetMemoryStream(Stream streamResponse)
            {
                MemoryStream _stream = new MemoryStream();
                int Length = 256;
                Byte[] buffer = new Byte[Length];
                int bytesRead = streamResponse.Read(buffer, 0, Length);
                // write the required bytes 
                while (bytesRead > 0)
                {
                    _stream.Write(buffer, 0, bytesRead);
                    bytesRead = streamResponse.Read(buffer, 0, Length);
                }
                return _stream;
            }
            ///
            /// 为请求准备参数
            ///
            ///参数列表
            ///读取数据时的编码方式
            private void SetRequest(HttpItem objhttpItem)
            {
                // 验证证书
                SetCer(objhttpItem);
                //设置Header参数
                if (objhttpItem.Header != null && objhttpItem.Header.Count > 0)
                {
                    foreach (string item in objhttpItem.Header.AllKeys)
                    {
                        request.Headers.Add(item, objhttpItem.Header[item]);
                    }
                }
                // 设置代理
                SetProxy(objhttpItem);
                //请求方式Get或者Post
                request.Method = objhttpItem.Method;
                request.Timeout = objhttpItem.Timeout;
                request.ReadWriteTimeout = objhttpItem.ReadWriteTimeout;
                //Accept
                request.Accept = objhttpItem.Accept;
                //ContentType返回类型
                request.ContentType = objhttpItem.ContentType;
                //UserAgent客户端的访问类型,包括浏览器版本和操作系统信息
                request.UserAgent = objhttpItem.UserAgent;
                // 编码
                encoding = objhttpItem.Encoding;
                //设置Cookie
                SetCookie(objhttpItem);
                //来源地址
                request.Referer = objhttpItem.Referer;
                //是否执行跳转功能
                request.AllowAutoRedirect = objhttpItem.Allowautoredirect;
                //设置Post数据
                SetPostData(objhttpItem);
                //设置最大连接
                if (objhttpItem.Connectionlimit > 0)
                    request.ServicePoint.ConnectionLimit = objhttpItem.Connectionlimit;
            }
            ///
            /// 设置证书
            ///
            ///
            private void SetCer(HttpItem objhttpItem)
            {
                if (!string.IsNullOrEmpty(objhttpItem.CerPath))
                {
                    //这一句一定要写在创建连接的前面。使用回调的方法进行证书验证。
                    ServicePointManager.ServerCertificateValidationCallback = new System.Net.Security.RemoteCertificateValidationCallback(CheckValidationResult);
                    //初始化对像,并设置请求的URL地址
                    request = (HttpWebRequest)WebRequest.Create(objhttpItem.URL);
                    //将证书添加到请求里
                    request.ClientCertificates.Add(new X509Certificate(objhttpItem.CerPath));
                }
                else
                    //初始化对像,并设置请求的URL地址
                    request = (HttpWebRequest)WebRequest.Create(objhttpItem.URL);
            }
            ///
            /// 设置Cookie
            ///
            ///Http参数
            private void SetCookie(HttpItem objhttpItem)
            {
                if (!string.IsNullOrEmpty(objhttpItem.Cookie))
                    //Cookie
                    request.Headers[HttpRequestHeader.Cookie] = objhttpItem.Cookie;
                //设置Cookie
                if (objhttpItem.CookieCollection != null)
                {
                    request.CookieContainer = new CookieContainer();
                    request.CookieContainer.Add(objhttpItem.CookieCollection);
                }
            }
            ///
            /// 设置Post数据
            ///
            ///Http参数
            private void SetPostData(HttpItem objhttpItem)
            {
                //验证在得到结果时是否有传入数据
                if (request.Method.Trim().ToLower().Contains("post"))
                {
                    byte[] buffer = null;
                    //写入Byte类型
                    if (objhttpItem.PostDataType == PostDataType.Byte && objhttpItem.PostdataByte != null && objhttpItem.PostdataByte.Length > 0)
                    {
                        //验证在得到结果时是否有传入数据
                        buffer = objhttpItem.PostdataByte;
                    }//写入文件
                    else if (objhttpItem.PostDataType == PostDataType.FilePath && !string.IsNullOrEmpty(objhttpItem.Postdata))
                    {
                        StreamReader r = new StreamReader(objhttpItem.Postdata, encoding);
                        buffer = Encoding.Default.GetBytes(r.ReadToEnd());
                        r.Close();
                    } //写入字符串
                    else if (!string.IsNullOrEmpty(objhttpItem.Postdata))
                    {
                        buffer = Encoding.Default.GetBytes(objhttpItem.Postdata);
                    }
                    if (buffer != null)
                    {
                        request.ContentLength = buffer.Length;
                        request.GetRequestStream().Write(buffer, 0, buffer.Length);
                    }
                }
            }
            ///
            /// 设置代理
            ///
            ///参数对象
            private void SetProxy(HttpItem objhttpItem)
            {
                if (!string.IsNullOrEmpty(objhttpItem.ProxyIp))
                {
                    //设置代理服务器
                    WebProxy myProxy = new WebProxy(objhttpItem.ProxyIp, false);
                    //建议连接
                    myProxy.Credentials = new NetworkCredential(objhttpItem.ProxyUserName, objhttpItem.ProxyPwd);
                    //给当前请求对象
                    request.Proxy = myProxy;
                    //设置安全凭证
                    request.Credentials = CredentialCache.DefaultNetworkCredentials;
                }
            }
            ///
            /// 回调验证证书问题
            ///
            ///流对象
            ///证书
            ///X509Chain
            ///SslPolicyErrors
            ///bool
            public bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
            {
                // 总是接受   
                return true;
            }
            #endregion
            #region 普通类型
            ///
            ///采用https协议访问网络,根据传入的URl地址,得到响应的数据字符串。
            ///
            ///参数列表
            ///String类型的数据
            public HttpResult GetHtml(HttpItem objhttpItem)
            {
                try
                {
                    //准备参数
                    SetRequest(objhttpItem);
                }
                catch (Exception ex)
                {
                    return new HttpResult() { Cookie = "", Header = null, Html = ex.Message, StatusDescription = "配置参考时报错"};
                }
                //调用专门读取数据的类
                return GetHttpRequestData(objhttpItem);
            }
            #endregion
        }
        ///
        /// Http请求参考类
        ///
        public class HttpItem
        {
            string _URL = string.Empty;
            ///
            /// 请求URL必须填写
            ///
            public string URL
            {
                get { return _URL; }
                set { _URL = value; }
            }
            string _Method = "GET";
            ///
            /// 请求方式默认为GET方式,当为POST方式时必须设置Postdata的值
            ///
            public string Method
            {
                get { return _Method; }
                set { _Method = value; }
            }
            int _Timeout = 100000;
            ///
            /// 默认请求超时时间
            ///
            public int Timeout
            {
                get { return _Timeout; }
                set { _Timeout = value; }
            }
            int _ReadWriteTimeout = 30000;
            ///
            /// 默认写入Post数据超时间
            ///
            public int ReadWriteTimeout
            {
                get { return _ReadWriteTimeout; }
                set { _ReadWriteTimeout = value; }
            }
            string _Accept = "text/html, application/xhtml+xml, */*";
            ///
            /// 请求标头值 默认为text/html, application/xhtml+xml, */*
            ///
            public string Accept
            {
                get { return _Accept; }
                set { _Accept = value; }
            }
            string _ContentType = "text/html";
            ///
            /// 请求返回类型默认 text/html
            ///
            public string ContentType
            {
                get { return _ContentType; }
                set { _ContentType = value; }
            }
            string _UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
            ///
            /// 客户端访问信息默认Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
            ///
            public string UserAgent
            {
                get { return _UserAgent; }
                set { _UserAgent = value; }
            }
            Encoding _Encoding = null;
            ///
            /// 返回数据编码默认为NUll,可以自动识别,一般为utf-8,gbk,gb2312
            ///
            public Encoding Encoding
            {
                get { return _Encoding; }
                set { _Encoding = value; }
            }
            private PostDataType _PostDataType = PostDataType.String;
            ///
            /// Post的数据类型
            ///
            public PostDataType PostDataType
            {
                get { return _PostDataType; }
                set { _PostDataType = value; }
            }
            string _Postdata = string.Empty;
            ///
            /// Post请求时要发送的字符串Post数据
            ///
            public string Postdata
            {
                get { return _Postdata; }
                set { _Postdata = value; }
            }
            private byte[] _PostdataByte = null;
            ///
            /// Post请求时要发送的Byte类型的Post数据
            ///
            public byte[] PostdataByte
            {
                get { return _PostdataByte; }
                set { _PostdataByte = value; }
            }
            CookieCollection cookiecollection = null;
            ///
            /// Cookie对象集合
            ///
            public CookieCollection CookieCollection
            {
                get { return cookiecollection; }
                set { cookiecollection = value; }
            }
            string _Cookie = string.Empty;
            ///
            /// 请求时的Cookie
            ///
            public string Cookie
            {
                get { return _Cookie; }
                set { _Cookie = value; }
            }
            string _Referer = string.Empty;
            ///
            /// 来源地址,上次访问地址
            ///
            public string Referer
            {
                get { return _Referer; }
                set { _Referer = value; }
            }
            string _CerPath = string.Empty;
            ///
            /// 证书绝对路径
            ///
            public string CerPath
            {
                get { return _CerPath; }
                set { _CerPath = value; }
            }
            private Boolean isToLower = false;
            ///
            /// 是否设置为全文小写,默认为不转化
            ///
            public Boolean IsToLower
            {
                get { return isToLower; }
                set { isToLower = value; }
            }
            private Boolean allowautoredirect = false;
            ///
            /// 支持跳转页面,查询结果将是跳转后的页面,默认是不跳转
            ///
            public Boolean Allowautoredirect
            {
                get { return allowautoredirect; }
                set { allowautoredirect = value; }
            }
            private int connectionlimit = 1024;
            ///
            /// 最大连接数
            ///
            public int Connectionlimit
            {
                get { return connectionlimit; }
                set { connectionlimit = value; }
            }
            private string proxyusername = string.Empty;
            ///
            /// 代理Proxy 服务器用户名
            ///
            public string ProxyUserName
            {
                get { return proxyusername; }
                set { proxyusername = value; }
            }
            private string proxypwd = string.Empty;
            ///
            /// 代理 服务器密码
            ///
            public string ProxyPwd
            {
                get { return proxypwd; }
                set { proxypwd = value; }
            }
            private string proxyip = string.Empty;
            ///
            /// 代理 服务IP
            ///
            public string ProxyIp
            {
                get { return proxyip; }
                set { proxyip = value; }
            }
            private ResultType resulttype = ResultType.String;
            ///
            /// 设置返回类型String和Byte
            ///
            public ResultType ResultType
            {
                get { return resulttype; }
                set { resulttype = value; }
            }
            private WebHeaderCollection header = new WebHeaderCollection();
            //header对象
            public WebHeaderCollection Header
            {
                get { return header; }
                set { header = value; }
            }
        }
        ///
        /// Http返回参数类
        ///
        public class HttpResult
        {
            string _Cookie = string.Empty;
            ///
            /// Http请求返回的Cookie
            ///
            public string Cookie
            {
                get { return _Cookie; }
                set { _Cookie = value; }
            }
            CookieCollection cookiecollection = new CookieCollection();
            ///
            /// Cookie对象集合
            ///
            public CookieCollection CookieCollection
            {
                get { return cookiecollection; }
                set { cookiecollection = value; }
            }
            private string html = string.Empty;
            ///
            /// 返回的String类型数据 只有ResultType.String时才返回数据,其它情况为空
            ///
            public string Html
            {
                get { return html; }
                set { html = value; }
            }
            private byte[] resultbyte = null;
            ///
            /// 返回的Byte数组 只有ResultType.Byte时才返回数据,其它情况为空
            ///
            public byte[] ResultByte
            {
                get { return resultbyte; }
                set { resultbyte = value; }
            }
            private WebHeaderCollection header = new WebHeaderCollection();
            //header对象
            public WebHeaderCollection Header
            {
                get { return header; }
                set { header = value; }
            }
            private string statusDescription = "";
            ///
            /// 返回状态说明
            ///
            public string StatusDescription
            {
                get { return statusDescription; }
                set { statusDescription = value; }
            }
            private HttpStatusCode statusCode = HttpStatusCode.OK;
            ///
            /// 返回状态码,默认为OK
            ///
            public HttpStatusCode StatusCode
            {
                get { return statusCode; }
                set { statusCode = value; }
            }
        }
        ///
        /// 返回类型
        ///
        public enum ResultType
        {
            ///
            /// 表示只返回字符串 只有Html有数据
            ///
            String,
            ///
            /// 表示返回字符串和字节流 ResultByte和Html都有数据返回
            ///
            Byte
        }
        ///
        /// Post的数据格式默认为string
        ///
        public enum PostDataType
        {
            ///
            /// 字符串类型,这时编码Encoding可不设置
            ///
            String,
            ///
            /// Byte类型,需要设置PostdataByte参数的值编码Encoding可设置为空
            ///
            Byte,
            ///
            /// 传文件,Postdata必须设置为文件的绝对路径,必须设置Encoding的值
            ///
            FilePath
        }
    }
     
     
    分类: .net代码
  • 相关阅读:
    Jmeter入门(二、元件和组件)
    Jmeter入门(一)
    loadrunner (三、脚本执行&结果分析)
    loadrunner(二、创建脚本)
    Centos常用命令(九、shell编程-综合案例)
    Centos常用命令(八、shell编程-函数)
    利用python实现动态数组
    为什么说 Mybatis 是半自动 ORM 映射工具?它与全自动的区别在哪里
    #{}和${}的区别是什么
    Mybatis 动态 sql 是做什么的?都有哪些动态 sql?能简述一下动态 sql 的执行原理不
  • 原文地址:https://www.cnblogs.com/Leo_wl/p/3163079.html
Copyright © 2011-2022 走看看