zoukankan      html  css  js  c++  java
  • C#自动登录网页浏览页面 抓取数据


    需求:客户的数据同时存在于另外一个不可控的系统中,需要和当前系统保持同步。

    思路:自动登录另外一个系统,然后抓取数据,同步到本系统中。

    技术点:模拟用户登录;保存登录状态;抓取数据

     

    程序非常简单

         /// <summary>

            /// visit the target url

            /// </summary>

            /// <param name="targetURL"></param>

            /// <param name="cc">this is for keeping cookies and sessions</param>

            /// <param name="param">this is the data need post inside form</param>

            /// <returns>html page</returns>

            public static string PostAndGetHTML(string targetURL,CookieContainer cc, Hashtable param)

            {

                //prepare the submit data

                string formData = "";

                foreach (DictionaryEntry de in param)

                {

                    formData += de.Key.ToString() + "=" + de.Value.ToString()+"&";

                }

                if(formData.Length>0)

                   formData = formData.Substring(0, formData.Length - 1); //remove last '&'

     

                ASCIIEncoding encoding = new ASCIIEncoding();

                byte[] data = encoding.GetBytes(formData);

     

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetURL);

                request.Method = "POST";    //post

                request.ContentType = "application/x-www-form-urlencoded";

                request.ContentLength = data.Length;

                request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)";

                

                Stream newStream = request.GetRequestStream();

                newStream.Write(data, 0, data.Length);

     

                newStream.Close();

     

                request.CookieContainer = cc;

                HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                cc.Add(response.Cookies);

                Stream stream = response.GetResponseStream();

                string result = new StreamReader(stream, System.Text.Encoding.Default).ReadToEnd();

                return result;

            }

    这是一个调用的例子:先登录,再查询。实际中这个逻辑可能有很多步骤。


            private void button2_Click(object sender, EventArgs e)

            {

                CookieContainer cc = new CookieContainer();//this is for keep the Session and Cookie

                Hashtable param = new Hashtable();//this is for keep post data.

     

                string urlLogin = "http://demo.server//login.asp";

                 //do find the elementId that needed. check the source of login page can get this information

                param.Add("User", "xxx");

                param.Add("Password", "xxxx");

                string result = PostAndGetHTML(urlLogin, cc, param);

                //check result, whether login success

              

                //if login success, goto the target url, and input some value.

                string url2 = " http://demo.server/query.asp?id=1";// need change. special logic

                param.Clear();

                //param.Add("SearchAreaId","JobId")

                result = PostAndGetHTML(url2, cc, new Hashtable());

                //ConvertToDT the html or do something others

     

     

            }

    这是一个简单的抓取网页数据的函数(针对 Table 内的数据,直接转化成 DataTable):


            private DataTable ConvertToDT(DataTable dt, string tableHTML)

            {

     

                int lastTD = tableHTML.ToLower().LastIndexOf("</td>");

                int firstRow = tableHTML.ToLower().IndexOf("<tr") + 3;//after ""<tr

                int index = tableHTML.ToLower().IndexOf("<tr", firstRow) + 3;//after ""<tr

                while (index < lastTD)

                {

                    DataRow dr = dt.NewRow();

                    for (int i = 0; i < dt.Columns.Count; i++)

                    {

                        string value = "";

                        int startTD = tableHTML.ToLower().IndexOf("<td", index) + 3;//after "<td"

                        int endTD = tableHTML.ToLower().IndexOf("</td>", startTD);

                        if (endTD < 0)

                            break;

                        string tdStr = tableHTML.Substring(startTD, endTD - startTD);

                       

                        //remove <> and others

                        tdStr = tdStr.Replace("&nbsp;", "").Replace("\t", "").Replace("\r", "");

                        string[] v = tdStr.Split('<', '>');

                        for (int j = 0; j < v.Length; j++)

                        {

                            j++;

                            if (v[j].Trim() != "")

                            {

                                value = v[j].Trim();

                                break;

                            } 

                        }

                        //

                        dr[i] = value;

                        index = endTD;

                    }

                    dt.Rows.Add(dr);

     

                }

                return dt;

            }



    注:对于登录时需要验证码的系统,此方法无效。(如果该系统把验证码存放在
    cookie 中则是例外,这种情况容易破解。)

  • 相关阅读:
    maven项目诡异的问题
    13) Developing Java Plugins
    15) maven dependency scope
    Bootstrap学习记录
    电力
    MongoDB学习记录
    Java基础知识
    旅游
    人生感悟
    【转】25岁到55岁:如何规划人生最重要的三个十年
  • 原文地址:https://www.cnblogs.com/netwom/p/953430.html
Copyright © 2011-2022 走看看