zoukankan      html  css  js  c++  java
  • 抓取小程序

        前言:想利用小程序导航页面来提升网站的流量,于是找到了 www.xcxdh666.com 这个小程序导航网站。

     分析网页

            1 发现网站其实也是用异步分页请求加载数据的  ,所以根本用不着xpath  解析html,直接分析其请求url

            2 点击“加载更多”找到请求,发现其实就 pageNum、category 两个参数

           3 所以直接请求 url 并带入参数,分析其返回的 json 结果

      编写代码

             1 首先建立接收类型       

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    /// <summary>
    /// Receiving type for one mini-program entry of the crawled page-list response
    /// (an element of <c>Result.dataList</c>).
    /// </summary>
    /// <remarks>
    /// Member names are lower camel case, presumably so they match the keys of the
    /// remote JSON payload without extra mapping attributes; confirm against the
    /// deserializer settings before renaming.
    /// </remarks>
    public class XcxApplet
    {
        /// <summary>Identifier of the entry as delivered by the remote site.</summary>
        public int id { get; set; }

        /// <summary>Category label; may be null or empty (the crawler substitutes a default).</summary>
        public string categoryName { get; set; }

        /// <summary>Display name of the mini-program; used by the crawler for duplicate detection.</summary>
        public string name { get; set; }

        /// <summary>Scan-code image path; the crawler appends it to the remote image host base URL.</summary>
        public string saomaUrl { get; set; }

        /// <summary>Summary text of the entry (stored as <c>Summary</c> by the crawler).</summary>
        public string sum { get; set; }

        /// <summary>Logo image path; the crawler appends it to the remote image host base URL.</summary>
        public string logoUrl { get; set; }
    }
     
    /// <summary>
    /// Receiving type for one page of the crawled page-list response.
    /// </summary>
    /// <remarks>
    /// Member names are lower camel case, presumably so they match the keys of the
    /// remote JSON payload; confirm against the deserializer settings before renaming.
    /// </remarks>
    public class Result
    {
        /// <summary>Entries contained in this page; may be null when the payload has no list.</summary>
        public List<XcxApplet> dataList { get; set; }

        /// <summary>Category value reported by the response.</summary>
        public string category { get; set; }

        /// <summary>Status value reported by the response (meaning defined by the remote site).</summary>
        public int status { get; set; }

        /// <summary>Page number reported by the response.</summary>
        public int pageNum { get; set; }
    }

      

         2  封装请求页面方法

      

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    /// <summary>
    /// Sends an HTTP POST to <paramref name="posturl"/> and returns the response body as text.
    /// </summary>
    /// <param name="posturl">Absolute URL to post to.</param>
    /// <param name="postData">
    /// Form-encoded request body (sent as UTF-8). When null or empty, no body is written.
    /// </param>
    /// <returns>
    /// The response body decoded as UTF-8, or <see cref="string.Empty"/> when the URL does not
    /// produce an HTTP request, no response stream is available, or any exception occurs.
    /// </returns>
    /// <remarks>
    /// Failures are deliberately reported as an empty string rather than thrown, so callers
    /// must test the result with <c>string.IsNullOrEmpty</c>; the failure is written to the trace log.
    /// </remarks>
    public static string GetPostPage(this string posturl, string postData)
    {
        Encoding encoding = Encoding.UTF8;
        byte[] data = string.IsNullOrEmpty(postData) ? null : encoding.GetBytes(postData);

        try
        {
            // Configure the request.
            var request = WebRequest.Create(posturl) as HttpWebRequest;
            if (request == null)
            {
                // Not an http/https URL (e.g. file:// or ftp://).
                return string.Empty;
            }

            request.CookieContainer = new CookieContainer();
            request.AllowAutoRedirect = true;
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";

            if (data != null)
            {
                request.ContentLength = data.Length;
                // Dispose the request stream even if the write fails.
                using (Stream outstream = request.GetRequestStream())
                {
                    outstream.Write(data, 0, data.Length);
                }
            }

            // Send the request and read the whole response; every resource is released on exit.
            using (var response = request.GetResponse() as HttpWebResponse)
            {
                if (response == null)
                {
                    return string.Empty;
                }

                using (Stream instream = response.GetResponseStream())
                {
                    if (instream == null)
                    {
                        return string.Empty;
                    }

                    using (var reader = new StreamReader(instream, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort contract: keep returning an empty string, but leave a trace of the cause.
            System.Diagnostics.Trace.TraceError("GetPostPage failed for '{0}': {1}", posturl, ex.Message);
            return string.Empty;
        }
    }

      3  图片url处理   思路就是要将其返回的url 请求下载到本地或者上传到自己对应的图片服务器,

               我这里是用七牛云存储img的 ,这里你可以改成下载到本地 返回本地的url就好。

      

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    /// <summary>
    /// Downloads the image at <paramref name="imgurl"/> and uploads it to the Qiniu bucket,
    /// using the last path segment of the URL as the storage key.
    /// </summary>
    /// <param name="imgurl">Absolute URL of the source image.</param>
    /// <returns>
    /// The URL of the uploaded image under <c>http://img.siyouku.cn/</c>, or an empty string when
    /// <paramref name="imgurl"/> is null/empty or the download/upload fails.
    /// </returns>
    /// <remarks>
    /// <c>Config.AutoZone</c> runs outside the try block, so an invalid access key or bucket
    /// still surfaces as an exception to the caller, as before.
    /// </remarks>
    public string QiniuUplod(string imgurl)
    {
        if (string.IsNullOrEmpty(imgurl))
        {
            // Nothing to download; follow the method's "empty string on failure" convention.
            return "";
        }

        // Replace these placeholders with your own AccessKey and SecretKey.
        var accessKey = "你的accesskey";
        var secretKey = "你的secretkey";

        // The Mac is required to sign the upload token.
        Mac mac = new Mac(accessKey, secretKey);
        string bucket = "siyouku";

        // Everything after the last '/' (the whole string when there is no '/').
        string saveKey = imgurl.Substring(imgurl.LastIndexOf('/') + 1);

        // Make sure AK and bucket are correct, otherwise this call throws (e.g. code 612/631).
        Qiniu.Common.Config.AutoZone(accessKey, bucket, false);

        // Upload policy, see https://developer.qiniu.com/kodo/manual/put-policy
        PutPolicy putPolicy = new PutPolicy();
        // Plain (non-overwriting) upload. To overwrite an existing file with the same key,
        // use Scope = bucket + ":" + saveKey instead.
        putPolicy.Scope = bucket;
        // Lifetime of the policy, i.e. of the token generated from it.
        putPolicy.SetExpires(3600);

        // Upload token, see https://developer.qiniu.com/kodo/manual/upload-token
        string jstr = putPolicy.ToJsonString();
        string token = Auth.CreateUploadToken(mac, jstr);

        try
        {
            var wReq = System.Net.WebRequest.Create(imgurl) as System.Net.HttpWebRequest;
            if (wReq == null)
            {
                // Not an http/https URL.
                return "";
            }

            using (var resp = wReq.GetResponse() as System.Net.HttpWebResponse)
            {
                if (resp == null)
                {
                    return "";
                }

                using (var stream = resp.GetResponseStream())
                {
                    if (stream == null)
                    {
                        return "";
                    }

                    // The network stream is not seekable (no Stream.Length), so do not use
                    // UploadManager.UploadStream; use FormUploader (or ResumableUploader) instead.
                    FormUploader fu = new FormUploader();
                    var result = fu.UploadStream(stream, saveKey, token);
                    var uploaded = Newtonsoft.Json.JsonConvert.DeserializeObject<QiniuResult>(result.Text);

                    // A missing key means the upload did not succeed; do not fabricate a URL.
                    if (uploaded == null || string.IsNullOrEmpty(uploaded.key))
                    {
                        return "";
                    }

                    return $"http://img.siyouku.cn/{uploaded.key}";
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort: a single failed image must not abort the caller's crawl.
            System.Diagnostics.Trace.TraceError("QiniuUplod failed for '{0}': {1}", imgurl, ex.Message);
            return "";
        }
    }

      

       4 最后是请求主体方法 

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    /// <summary>
    /// Crawls the paged mini-program list of www.xcxdh666.com, mirrors each entry's images via
    /// <c>QiniuUplod</c>, and stores entries whose name is not yet present in <c>Db.Applet</c>.
    /// </summary>
    /// <returns>A content result reporting the total elapsed time in milliseconds.</returns>
    /// <remarks>
    /// Pages that cannot be fetched or that contain no list are skipped instead of aborting the run.
    /// Changes are saved once per page.
    /// </remarks>
    public ActionResult GetxcxList()
    {
        // Number of pages requested from the remote list (pageNum 0 .. pageCount - 1).
        const int pageCount = 54;

        // Measures the total crawl time.
        Stopwatch watch = Stopwatch.StartNew();

        // Names handled during this run. The database check below only sees saved rows,
        // so this prevents duplicates among entries added before SaveChanges.
        var seenNames = new System.Collections.Generic.HashSet<string>();

        for (int j = 0; j < pageCount; j++)
        {
            // Example: https://www.xcxdh666.com/pageList.htm?pageNum=0 -> { dataList: [...] }
            string url =
                $"https://www.xcxdh666.com/pageList.htm?pageNum={j}";

            // GetPostPage reports failure as an empty string, not null.
            var str = url.GetPostPage(null);
            if (string.IsNullOrEmpty(str))
            {
                continue;
            }

            var result = str.JsonConvert<Result>(); // string deserialization extension method
            if (result == null || result.dataList == null)
            {
                continue;
            }

            foreach (var item in result.dataList)
            {
                if (item == null)
                {
                    continue;
                }

                string name = item.name;

                // Skip names already handled in this run or already stored.
                if (!seenNames.Add(name) || Db.Applet.Any(a => a.Name == name))
                {
                    continue;
                }

                var applet = new Applet()
                {
                    CategoryName = string.IsNullOrEmpty(item.categoryName) ? "其它" : item.categoryName,
                    Name = name,
                    SaomiaoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.saomaUrl}"),
                    Summary = item.sum,
                    LogoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.logoUrl}"),
                    SortNum = j,
                    CreateUser = "wenqing",
                    CreateTime = DateTime.Now
                };
                Db.Applet.Add(applet);
            }

            Db.SaveChanges();
        }

        watch.Stop();
        return Content("爬取完成!本次请求总共耗时:"+ watch.ElapsedMilliseconds);
    }
        }

      

    ok  到这里就全部抓取完成

             这里附上 展示地址  http://siyouku.cn/Applet

  • 相关阅读:
    关于read函数的一些分析
    条件变量
    epoll的边缘触发与水平触发
    内核态的接收缓冲区和发送缓冲区
    SourceTreet提交时显示remote: Incorrect username or password ( access token )(4种解决办法)
    前端技术汇总+Vue最新快速上手
    MyBatisPlus性能分析插件,条件构造器,代码自动生成器详解
    博客园怎样在Markdown编辑模式下调整图片大小(已解决)
    MyBatisPlus分页查询,删除操作
    idea括号选中时出现一条下滑线(突出显示)打开或关闭方法
  • 原文地址:https://www.cnblogs.com/zzp0320/p/7878701.html
Copyright © 2011-2022 走看看