zoukankan      html  css  js  c++  java
  • 使用C#抓取网页内容并分析获取数据

        /// <summary>
        /// Click handler that downloads the demand list page, extracts the links to the
        /// individual demand pages from the list region, then downloads each page and
        /// shows the content of its "product_card" block in a message box.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button5_Click(object sender, EventArgs e)
        {
            WebHeaderCollection header = new WebHeaderCollection();
            header.Set("Pragma", "no-cache");

            string listHtml = getHtml("http://www.biomart.cn/info/infoDemand.htm?pge=1", header);

            // Narrow the page down to the region between the two list marker comments.
            // getHtml strips line breaks, so '.' can span the whole region without RegexOptions.Singleline.
            Regex listRegionRegex = new Regex("<!-- 列表 -->(?<1>.*)<!-- /列表 -->");
            string listRegion = listRegionRegex.Match(listHtml).Groups[1].Value;

            // Verbatim strings: embedded quotes are doubled and regex escapes (\. \w \s) need no
            // extra backslash. A regular string literal does not compile with these patterns.
            Regex linkRegex = new Regex(@"href=""(?<1>http://www\.biomart\.cn/infodemand/\w+\.htm)""");
            Regex cardRegex = new Regex(@"<div class=""product_card"">(?<1>.*)\s+</p>\s+</div>");

            MatchCollection links = linkRegex.Matches(listRegion);

            // NOTE(review): hard-coded session cookie captured from a browser session; presumably
            // it has to be refreshed for the detail pages to be served - confirm before relying on it.
            header.Set(HttpRequestHeader.Cookie, "__utma=124945049.1686326021.1305093063.1305164868.1305187067.3; __utmz=124945049.1305093063.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JSESSIONID=9D7F7F4B5D73F453DA54B40A53D5E7C8; __utmc=124945049; __utmb=124945049.2.10.1305187067");

            foreach (Match link in links)
            {
                string detailUrl = link.Groups[1].Value;
                MessageBox.Show(detailUrl);

                string detailHtml = getHtml(detailUrl, header);
                // Shows an empty string when the page has no matching product_card block.
                MessageBox.Show(cardRegex.Match(detailHtml).Groups[1].Value);
            }
        }
    
    
        /// <summary>
        /// Downloads the document at <paramref name="url"/> with an HTTP GET request and
        /// returns its body as a single line of text (tabs, carriage returns and line feeds removed).
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <param name="header">Request headers to send; when null the request's default headers are used.</param>
        /// <returns>The response body decoded as UTF-8, with all tab, CR and LF characters stripped.</returns>
        /// <exception cref="WebException">The request failed or timed out (30 second timeout).</exception>
        private String getHtml(String url, WebHeaderCollection header)
        {
            // Request the URL the caller asked for, not a hard-coded page.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 30000;
            if (header != null)
            {
                // Use the caller's headers as given; do not shadow or overwrite them.
                request.Headers = header;
            }

            // Dispose response, stream and reader so the underlying connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                String content = reader.ReadToEnd();
                // Collapse the document to one line so callers can use '.' across former line breaks.
                return Regex.Replace(content, "\t|\r|\n", "");
            }
        }
  • 相关阅读:
    移动端解决fixed和input弹出虚拟键盘时样式错位
    JS的面向对象
    js计算两个时间范围间的间隔秒数
    使用js过滤字符串前后的空格
    C#时间格式-摘自http://www.cnblogs.com/xiaogongzhu/p/3825600.html
    [dp/贪心]435. 无重叠区间-----经典问题
    【dp】Leetcode面试题 17.16. 按摩师
    [dp]Leetcode.376.摆动序列
    Leetcode 945 使数组唯一的最小增量
    LeetCode 365.水壶问题
  • 原文地址:https://www.cnblogs.com/dennys/p/3400301.html
Copyright © 2011-2022 走看看