zoukankan      html  css  js  c++  java
  • 第一篇 C#模拟http请求抓取数据

    很多人用 Python 抓取数据,语法简单又方便。这里我先用 C# 来抓取数据,之后再自学 Python,看看能否也写出一些好文章。

    上一篇 asp.net webService添加头文件验证 

    第一步 先把应用程序发布iis服务器上

    第二步 用谷歌浏览器抓下网络请求

    分析请求头后,确认该请求是以 POST 方式提交的。

    请求一般看这些参数:Status Code 为 200 说明请求正常、没有发生跳转;Content-Length 是根据表单提交的参数计算出来的,如果发现计算出的长度和抓包得到的长度不一致,有可能是表单参数传递有问题,需要排查。

    还有 Content-Type:它有可能是 application/json,这时请求参数就要以 JSON 格式传递。

    2.1 请求

    2.2 响应

    注意红框中的内容就是我想要拿到的响应数据,但其中夹杂了很多无用的 HTML 标签,需要把这些标签替换掉。

    参考文献:

    正则表达式,去除所有HTML标签 https://www.cnblogs.com/caok168/articles/2567117.html

    第三步 编写相关代码

      1 using MyAutomaticRefund.Common;
      2 using Newtonsoft.Json;
      3 using System;
      4 using System.Collections.Generic;
      5 using System.ComponentModel;
      6 using System.Data;
      7 using System.Drawing;
      8 using System.IO;
      9 using System.Linq;
     10 using System.Net;
     11 using System.Text;
     12 using System.Text.RegularExpressions;
     13 using System.Threading.Tasks;
     14 using System.Windows.Forms;
     15 
     16 namespace MyAutomaticRefund
     17 {
     18     /// <summary>
     19     /// WinForms demo that replays a captured ASP.NET WebForms postback and shows
     20     /// the response text with all HTML tags stripped.
     21     /// </summary>
     22     public partial class Form1 : Form
     23     {
     24         // One container shared by every request so cookies set by the server are
     25         // sent back on later calls. It must be a real instance: a null
     26         // request.CookieContainer means cookies are not handled at all.
     27         static CookieContainer m_Cookie = new CookieContainer();
     28 
     29         public Form1()
     30         {
     31             InitializeComponent();
     32         }
     33 
     34         /// <summary>
     35         /// Click handler: fetches the page, strips the HTML tags and shows the remaining text.
     36         /// </summary>
     37         private void button1_Click(object sender, EventArgs e)
     38         {
     39             string result = CaptureData();
     40 
     41             string regexstr = @"<[^>]*>";    // matches any HTML tag
     42             result = Regex.Replace(result, regexstr, string.Empty, RegexOptions.IgnoreCase);
     43             MessageBox.Show(result.Trim());
     44         }
     45 
     46         /// <summary>
     47         /// Posts the captured form fields to the WebForm1 page and returns the response body.
     48         /// </summary>
     49         /// <returns>The response body on HTTP 200; an empty string on any other status or on failure.</returns>
     50         private string CaptureData()
     51         {
     52             string result = "";
     53             try
     54             {
     55                 string Url = "http://192.168.21.195:8044/WebForm1";
     56                 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
     57                 request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
     58                 request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";
     59                 request.Method = "POST";
     60                 request.KeepAlive = true;
     61                 request.Referer = "http://192.168.21.195:8044/WebForm1";
     62                 request.ContentType = "application/x-www-form-urlencoded";
     63                 // NOTE(review): advertising gzip/deflate means the body may arrive compressed;
     64                 // presumably HttpRequestClient.getResponseBody handles that - confirm.
     65                 request.Headers.Add("Accept-Encoding", "gzip, deflate");
     66                 request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9");
     67                 request.CookieContainer = m_Cookie;
     68                 request.Host = "192.168.21.195:8044";
     69 
     70                 HttpRequestClient s = new HttpRequestClient(true);
     71 
     72                 // Hidden WebForms fields copied from the captured browser request.
     73                 Dictionary<string, string> dic = new Dictionary<string, string>();
     74                 dic.Add("__VIEWSTATE", "BPyDUNYWaU0XKZc1m/+hb4U4rlITcqsuUMRxGp8elUR/ZgK5owxYH030QThFNas5u/roAT5kBYSSMaMYwRFEcw1COVEwFNapeyfnLq2PZ98=");
     75                 dic.Add("__VIEWSTATEGENERATOR", "B6E7D48B");
     76                 dic.Add("__EVENTVALIDATION", "3d+F/uurjOe90j09vxq1PU5Tli3EVssuM/HOISX5tyLUP2HdR956+2PC8ezRt2kDXMr1BDkstuMTV11I0laFwL+adHjgxQYtfEObtF/NligPunXkuGz4LuJ6VYdtACGg");
     77                 dic.Add("Button1", "Button");
     78 
     79                 byte[] data = Encoding.UTF8.GetBytes(ConvertStrDic(dic));
     80 
     81                 // Content-Length is a byte count, so use the encoded payload, not the string length.
     82                 request.ContentLength = data.Length;
     83                 using (Stream reqStream = request.GetRequestStream())
     84                 {
     85                     reqStream.Write(data, 0, data.Length);
     86                 }
     87 
     88                 // Dispose the response so the underlying connection is released.
     89                 using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
     90                 {
     91                     if (response.StatusCode == HttpStatusCode.OK)
     92                     {
     93                         result = s.getResponseBody(response);
     94                     }
     95                 }
     96                 return result;
     97             }
     98             catch (Exception ex)
     99             {
    100                 // Best effort: the caller only displays text, so report the failure to the
    101                 // debug output and fall back to the empty result instead of crashing the UI.
    102                 System.Diagnostics.Debug.WriteLine(ex);
    103                 return result;
    104             }
    105         }
    106 
    107         /// <summary>
    108         /// Builds an application/x-www-form-urlencoded body from the given fields.
    109         /// </summary>
    110         /// <param name="dic">Form field names and their raw (unencoded) values.</param>
    111         /// <returns>"key=value" pairs joined with '&amp;'; keys and values are URL-encoded.</returns>
    112         string ConvertStrDic(Dictionary<string, string> dic)
    113         {
    114             return string.Join("&", dic.Select(item =>
    115                 System.Web.HttpUtility.UrlEncode(item.Key) + "=" + System.Web.HttpUtility.UrlEncode(item.Value)));
    116         }
    117 
    118         /// <summary>
    119         /// Typed holder for the posted form fields; not used by the code shown here.
    120         /// </summary>
    121         public class formTable
    122         {
    123             public string __VIEWSTATE { get; set; }
    124             public string __VIEWSTATEGENERATOR { get; set; }
    125             public string __EVENTVALIDATION { get; set; }
    126             public string Button1 { get; set; }
    127         }
    128     }
    129 }

    运行结果

    用python语言 怎么写

    这里我用到的库是 urllib

    用到了 urllib.request 模块中的 Request、urlopen 等函数

    # Approach 2: POST the captured form with urllib.
    import urllib.parse
    import urllib.request
    
    # Pre-encoded form body captured from the browser, kept for reference:
    #data ="__VIEWSTATE=BPyDUNYWaU0XKZc1m%2F%2Bhb4U4rlITcqsuUMRxGp8elUR%2FZgK5owxYH030QThFNas5u%2FroAT5kBYSSMaMYwRFEcw1COVEwFNapeyfnLq2PZ98%3D&__VIEWSTATEGENERATOR=B6E7D48B&__EVENTVALIDATION=3d%2BF%2FuurjOe90j09vxq1PU5Tli3EVssuM%2FHOISX5tyLUP2HdR956%2B2PC8ezRt2kDXMr1BDkstuMTV11I0laFwL%2BadHjgxQYtfEObtF%2FNligPunXkuGz4LuJ6VYdtACGg&Button1=Button"
    
    # Named form_fields rather than str so the builtin str type is not shadowed.
    form_fields = {
            '__VIEWSTATE': 'i8VVE3gtBLKgjBpYFwMCruW86sOjv2lTpmzZF3mD3L/QIkX0Ode3Xc9MaMXFQnjiAK80xxkQ9rjTyjGrBWvbLwcxug4r9Akhmcxs/plCdYM=',
            '__VIEWSTATEGENERATOR':'B6E7D48B',
            '__EVENTVALIDATION':'2X6ageL+LlYeiTSgyQPd/FPAhPtg350MNiiKvURoS4xMsysX+0HyGjGN93yx7K27/NubbDHt2oTpWbFCv1dampTLrZkWvsf32lHlqJUliAsudoGhZ3kwM/XCxXSlvhNr',
            'Button1':'Button'
    }
    
    # Optional request headers tried while debugging:
    # headers={
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    # }
    
    # urlopen needs the POST body as bytes, so url-encode the fields and then encode to UTF-8.
    data = urllib.parse.urlencode(form_fields).encode('utf-8')
    url = "http://192.168.21.195:8044/WebForm1"
    # request = urllib.request.Request(url=url,data=data,headers=headers,origin_req_host='192.168.21.195:8044',unverifiable=True,method='POST')
    # Variant 1: build a Request object first
    # request = urllib.request.Request(url=url,data=data)
    # response = urllib.request.urlopen(request)
    # print(response.read().decode('utf-8'))
    # Variant 2: pass the URL and the body straight to urlopen.
    # The with-block closes the connection; the timeout stops an unreachable host from hanging the script.
    with urllib.request.urlopen(url, data=data, timeout=10) as response:
        print(response.read().decode('utf-8'))
    # print(response.read()) would show the raw bytes object; decode it to get readable text
    

    太乱了,整理一下把调试代码拿掉

    # Approach 2: POST the captured form with urllib.
    import urllib.parse
    import urllib.request
    
    # Named form_fields rather than str so the builtin str type is not shadowed.
    form_fields = {
            '__VIEWSTATE': 'i8VVE3gtBLKgjBpYFwMCruW86sOjv2lTpmzZF3mD3L/QIkX0Ode3Xc9MaMXFQnjiAK80xxkQ9rjTyjGrBWvbLwcxug4r9Akhmcxs/plCdYM=',
            '__VIEWSTATEGENERATOR':'B6E7D48B',
            '__EVENTVALIDATION':'2X6ageL+LlYeiTSgyQPd/FPAhPtg350MNiiKvURoS4xMsysX+0HyGjGN93yx7K27/NubbDHt2oTpWbFCv1dampTLrZkWvsf32lHlqJUliAsudoGhZ3kwM/XCxXSlvhNr',
            'Button1':'Button'
    }
    
    # urlopen needs the POST body as bytes, so url-encode the fields and then encode to UTF-8.
    data = urllib.parse.urlencode(form_fields).encode('utf-8')
    url = "http://192.168.21.195:8044/WebForm1"
    
    # Passing data makes urlopen send a POST. The with-block closes the connection;
    # the timeout stops an unreachable host from hanging the script.
    with urllib.request.urlopen(url, data=data, timeout=10) as response:
        print(response.read().decode('utf-8'))
    

     

    运行效果

    再优化下正则去除多余html标签

    import re
    
    # `response` is the object returned by the urlopen(...) call in the script above.
    tag_pattern = re.compile(r'<[^>]*>')
    html_text = response.read().decode('utf-8')
    # Drop every tag, then split on whitespace to get the remaining words.
    text_only = tag_pattern.sub('', html_text)
    print(text_only.split())
    

      

    总结:1,经测试,这个请求不需要额外添加请求头。

                2,抓数据方面,python语法比C#语法简单.

                3,python语法注意事项:data 是字典(键值对)形式,写法和 json 类似,发送前需要先用 urlencode 编码并转成 bytes

  • 相关阅读:
    扩增子分析解读7物种分类统计 筛选进化树和其它
    R函数详解
    扩增子统计绘图1箱线图:Alpha多样性
    扩增子分析解读6进化树 Alpha Beta多样性
    扩增子分析解读5物种注释 OTU表操作
    扩增子分析解读4去嵌合体 非细菌序列 生成代表性序列和OTU表
    扩增子分析解读3格式转换 去冗余 聚类
    扩增子分析解读2提取barcode 质控及样品拆分 切除扩增引物
    执行join_paired_ends.py报错Cannot find fastq-join
    扩增子分析解读1质控 实验设计 双端序列合并
  • 原文地址:https://www.cnblogs.com/suntanyong88/p/12426758.html
Copyright © 2011-2022 走看看