zoukankan      html  css  js  c++  java
  • C# 关于爬取网站数据遇到csrf-token的分析与解决

    需求

    某航空公司物流单信息查询,是一个post请求。通过后台模拟POST HTTP请求发现无法获取页面数据,查看航空公司网站后,发现该网站使用了防御CSRF攻击的机制,因此请求直接返回40X错误。

    关于CSRF

    读者自行百度

    网站HTTP请求分析 

    Headers

    Form Data

    在请求头(Headers)里包含了cookie 与 x-csrf-token,Form Data 里包含了_csrf(与请求头里的值是一样的)。

    这里通过查看该网站的JS源代码发现_csrf 来自于网页的head标签里

    猜测cookie与 x-csrf-token是有一定的有效期,并且他们共同作用来防御CSRF攻击。

    解决方案

    1,首先请求一下该航空公司的网站,获取cookie与_csrf

    2,然后C# 模拟http分别在head和formdata里加入如上参数,发起请求

     代码

     /// <summary>
     /// Loads a page once and captures what a follow-up request needs to pass the
     /// site's CSRF check: the cookie string reported by the HTTP helper and the
     /// <c>content</c> values of every <c>&lt;meta name="_csrf..."&gt;</c> tag in the HTML.
     /// </summary>
     /// <remarks>
     /// Construction is best-effort: on a blank URL or any failure the cookie stays
     /// <c>null</c> and the token list stays empty instead of throwing.
     /// </remarks>
     public class CSRFToken
        {
            // Matches one <meta name="_csrf..." content="..."/> tag. The quantifiers are
            // lazy so that several meta tags on the same line are matched one by one
            // instead of being swallowed into a single over-long match.
            const string MetaTagPattern = @"<meta name=""_csrf.*?"" content="".*?""/>";

            // Extracts the text between content=" and the next double quote.
            const string ContentValuePattern = @"(?<=content="").*?(?="")";

            string cookie; // cookie string to send with later requests to the site
            List<string> csrfs = new List<string>(); // token values, in page order; never null

            /// <summary>
            /// Requests <paramref name="url"/> and extracts the cookie and CSRF meta values.
            /// </summary>
            /// <param name="url">Address of the page that embeds the CSRF meta tags.
            /// A null, empty or whitespace value leaves the instance empty.</param>
            public CSRFToken(string url)
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return;
                }

                try
                {
                    // NOTE(review): HttpHelper.CreateGetHttpResponseForPC is defined elsewhere;
                    // presumably it returns the page HTML and reports the cookie via the
                    // out parameter - confirm.
                    var _http = new HttpHelper(url);
                    string cookie;
                    string html = _http.CreateGetHttpResponseForPC(out cookie);
                    this.cookie = cookie;

                    if (string.IsNullOrEmpty(html))
                    {
                        // Nothing to parse (the request failed or the body was empty).
                        return;
                    }

                    Regex contentValue = new Regex(ContentValuePattern, RegexOptions.None);
                    foreach (Match metaTag in Regex.Matches(html, MetaTagPattern))
                    {
                        foreach (Match value in contentValue.Matches(metaTag.Value))
                        {
                            csrfs.Add(value.Value);
                        }
                    }
                }
                catch (Exception e)
                {
                    // Stay best-effort, but leave a trace instead of silently hiding the failure.
                    System.Diagnostics.Trace.TraceWarning("CSRFToken: failed to load '{0}': {1}", url, e);
                }
            }

            /// <summary>Returns the captured cookie string, or null if none was captured.</summary>
            public String getCookie()
            {
                return cookie;
            }

            /// <summary>Replaces the stored cookie string.</summary>
            public void setCookie(String cookie)
            {
                this.cookie = cookie;
            }

            /// <summary>
            /// Returns the extracted CSRF meta <c>content</c> values in page order;
            /// empty when nothing was found or loading failed.
            /// </summary>
            public List<string> getCsrf_token()
            {
                return csrfs;
            }
        }

    httpHelper

      /// <summary>
      /// Sends an HTTP/1.0 POST with an <c>application/x-www-form-urlencoded</c> body to
      /// <c>_baseIPAddress</c> and returns the response body decoded as UTF-8.
      /// </summary>
      /// <param name="headers">Extra request headers (e.g. <c>Cookie</c>, <c>X-CSRF-TOKEN</c>);
      /// may be null or empty.</param>
      /// <param name="parameters">Form fields to post; keys and values are URL-encoded here,
      /// so pass them raw. May be null or empty, in which case no body is written.</param>
      /// <returns>The response body, or null when the request fails with a
      /// <see cref="WebException"/> (including non-success HTTP status codes).</returns>
      public string CreatePostHttpResponse(IDictionary<string, string> headers, IDictionary<string, string> parameters)
            {
                // Process-wide settings; applied before the request object is created.
                // NOTE(review): CheckValidationResult is defined elsewhere - if it accepts every
                // certificate, TLS validation is disabled for the whole process; confirm.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
                ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_baseIPAddress);
                request.ProtocolVersion = HttpVersion.Version10;
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                request.UserAgent = DefaultUserAgent;

                if (headers != null)
                {
                    foreach (KeyValuePair<string, string> header in headers)
                    {
                        request.Headers.Add(header.Key, header.Value);
                    }
                }

                // Write the form body only when there is something to post.
                if (parameters != null && parameters.Count > 0)
                {
                    StringBuilder buffer = new StringBuilder();
                    foreach (KeyValuePair<string, string> parameter in parameters)
                    {
                        if (buffer.Length > 0)
                        {
                            buffer.Append('&');
                        }

                        // Encode both sides so '&', '=', '+', spaces and non-ASCII text
                        // cannot corrupt the key=value&key=value structure.
                        buffer.Append(WebUtility.UrlEncode(parameter.Key));
                        buffer.Append('=');
                        buffer.Append(WebUtility.UrlEncode(parameter.Value));
                    }

                    byte[] data = Encoding.UTF8.GetBytes(buffer.ToString());
                    using (Stream stream = request.GetRequestStream())
                    {
                        stream.Write(data, 0, data.Length);
                    }
                }

                try
                {
                    // The using blocks release the connection even if reading throws.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream body = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
                catch (WebException ex)
                {
                    // The error response also holds a connection; close it before giving up.
                    if (ex.Response != null)
                    {
                        ex.Response.Close();
                    }

                    return null;
                }
            }
    
       /// <summary>
       /// Sends an HTTP/1.0 GET to <c>_baseIPAddress</c> and returns the response body
       /// decoded as UTF-8, reporting the raw <c>Set-Cookie</c> response header.
       /// </summary>
       /// <param name="cookie">Receives the value of the <c>Set-Cookie</c> response header
       /// (null when the server sent none), or an empty string when the request fails.</param>
       /// <returns>The response body, or null when the request fails with a
       /// <see cref="WebException"/> (including non-success HTTP status codes).</returns>
       public string CreateGetHttpResponse(out string cookie)
            {
                // Process-wide settings; applied before the request object is created.
                // NOTE(review): CheckValidationResult is defined elsewhere - if it accepts every
                // certificate, TLS validation is disabled for the whole process; confirm.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
                ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11;

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_baseIPAddress);
                request.ProtocolVersion = HttpVersion.Version10;
                request.Method = "GET";
                // Kept from the original so the request headers on the wire stay the same.
                request.ContentType = "application/x-www-form-urlencoded";
                request.UserAgent = DefaultUserAgent;

                try
                {
                    // The using blocks release the connection even if reading throws.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        cookie = response.Headers["Set-Cookie"];

                        using (Stream body = response.GetResponseStream())
                        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
                catch (WebException ex)
                {
                    // The error response also holds a connection; close it before giving up.
                    if (ex.Response != null)
                    {
                        ex.Response.Close();
                    }

                    cookie = "";
                    return null;
                }
            }

    爬取程序

    爬取结果

    浏览器结果

    注意事项与结论

    1,不同的网站,获取csrf的方式不一样,但无论怎么做,只要信息传到了前台,我们都可以找到相应的方法来获取。

    2,请求时候的http验证可能不一样,测试某航空公司物流信息的时候,http请求的安全协议是TLS 1.2。

     ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12 | SecurityProtocolType.Tls11; 还有其他参数比如UserAgent后台可能也会验证

    3,基于如上航空公司,发现它的cookie和csrf_token在一定时间内不会改变,那么实际爬取的时候可以考虑缓存cookie以及csrf_token,只有当请求失败的时候,才重新获取

  • 相关阅读:
    【leetcode❤python】 374. Guess Number Higher or Lower
    【leetcode❤python】 8. String to Integer (atoi)
    【leetcode❤python】 438. Find All Anagrams in a String
    【leetcode❤python】 88. Merge Sorted Array
    【leetcode❤python】 225. Implement Stack using Queues
    【leetcode❤python】 58. Length of Last Word
    463:归档和传输文件
    438:管理网络
    365:查看系统日志条目
    350:描述系统日志架构
  • 原文地址:https://www.cnblogs.com/yibey/p/10870472.html
Copyright © 2011-2022 走看看