zoukankan      html  css  js  c++  java
  • C# 抓取网站数据

    // Download the quick-view listing page and load its rows into a DataTable.
    string url = "http://www.123.com/fast_view?a=1&gameId=25&areaId=0&serverId=0";
    string reffer = "http://www.123.com/";

    // GetHTML returns the page body, or a string starting with "Error:" on failure;
    // in the failure case no table is normally found and dt stays empty.
    string html = GetHTML(url, reffer, 10 * 1000);

    // Look for the closing tag only AFTER the opening tag, so a stray "</table>"
    // earlier in the page cannot make the table look missing.
    const string tableCloseTag = "</table>";
    int tableStart = html.IndexOf("<table", System.StringComparison.OrdinalIgnoreCase);
    int tableEnd = tableStart == -1
        ? -1
        : html.IndexOf(tableCloseTag, tableStart, System.StringComparison.OrdinalIgnoreCase);

    DataTable dt = new DataTable();
    dt.Columns.Add("ServerName", typeof(System.String));
    dt.Columns.Add("GoodsName", typeof(System.String));
    dt.Columns.Add("Price", typeof(System.String));
    dt.Columns.Add("Qty", typeof(System.String));
    dt.Columns.Add("Id", typeof(System.String));

    if (tableStart != -1 && tableEnd != -1)
    {
        // Restrict row matching to the table itself rather than the whole page.
        string tableHtml = html.Substring(tableStart, tableEnd - tableStart + tableCloseTag.Length);

        System.Text.RegularExpressions.RegexOptions options =
            System.Text.RegularExpressions.RegexOptions.Singleline |
            System.Text.RegularExpressions.RegexOptions.IgnoreCase;

        // Literal parentheses must be escaped; unescaped they open capture groups
        // (and the first pattern would not even compile).
        const string selfTextCutPattern = @"SelfTextCut2\('([^']*)'";   // first quoted argument of SelfTextCut2('...'
        const string parseFloatPattern = @"parseFloat\(([^)]*)\)";      // argument of parseFloat(...)
        const string dlPattern = @"dl\('(\d+)'\)";                      // numeric argument of dl('123')

        System.Text.RegularExpressions.MatchCollection trs =
            System.Text.RegularExpressions.Regex.Matches(tableHtml, "<tr[^>]*>(.*?)</tr>", options);

        for (int i = 0; i < trs.Count; i++)
        {
            System.Text.RegularExpressions.MatchCollection tds =
                System.Text.RegularExpressions.Regex.Matches(trs[i].Value, "<td[^>]*>(.*?)</td>", options);

            // Rows with fewer than 8 cells (e.g. header rows) are skipped.
            if (tds.Count < 8)
            {
                continue;
            }

            DataRow dr = dt.NewRow();
            // When a pattern does not match, Groups[1].Value is an empty string.
            dr["ServerName"] = System.Text.RegularExpressions.Regex.Match(tds[0].Value, selfTextCutPattern, options).Groups[1].Value;
            dr["GoodsName"] = System.Text.RegularExpressions.Regex.Match(tds[2].Value, selfTextCutPattern, options).Groups[1].Value;
            dr["Price"] = System.Text.RegularExpressions.Regex.Match(tds[5].Value, parseFloatPattern, options).Groups[1].Value;
            // Qty is the cell text with all markup removed.
            dr["Qty"] = System.Text.RegularExpressions.Regex.Replace(tds[6].Value, "<[^>]*>", "", options);
            dr["Id"] = System.Text.RegularExpressions.Regex.Match(tds[7].Value, dlPattern, options).Groups[1].Value;
            dt.Rows.Add(dr);
        }
    }
    
    ================================================================================================================================
    
    附上上面用到的 GetHTML 辅助方法:
    
    /// <summary>
    /// Downloads a page with an HTTP GET request and returns its body as text.
    /// </summary>
    /// <param name="strUrl">Absolute URL to request.</param>
    /// <param name="Reffer">Value sent in the Referer request header.</param>
    /// <param name="Timeout">Request timeout in milliseconds.</param>
    /// <returns>
    /// The response body decoded as gb2312, or a string beginning with
    /// "Error:" followed by the exception text when anything fails.
    /// </returns>
    public static string GetHTML(string strUrl, string Reffer, int Timeout)
    {
        try
        {
            // Build the request.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

            request.Method = "GET";
            request.ServicePoint.Expect100Continue = false;

            // Request headers.
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36";
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            // Let the framework advertise AND decode gzip/deflate. Previously the
            // header offered both encodings but only gzip responses were decoded.
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            request.KeepAlive = false;

            request.Referer = Reffer;
            request.Timeout = Timeout;

            // Dispose the response, stream and reader even when reading throws,
            // so the underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("gb2312")))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception err)
        {
            // Failures are reported through the return value rather than thrown.
            return "Error:" + err.ToString();
        }
    }
  • 相关阅读:
    jvm调优监控工具jps、jstack、jmap、jhat、jstat使用详解
    JS中Date和时间戳转换
    HashMap源码窥探
    HashMap,TreeMap,LinkedHashMap的默认排序
    hibernate-delete(Entity)的顺序问题
    macOS通过ssh使用PEM登录
    SpringMvc-<context:component-scan>使用说明
    macOS安装RZ,SZ
    CentOS6.8下yum安装Nginx
    第K人||约瑟夫环(链表)
  • 原文地址:https://www.cnblogs.com/ghelement/p/4512012.html
Copyright © 2011-2022 走看看