zoukankan      html  css  js  c++  java
  • asp.net 网页抓取内容

    网页抓取代码

    复制代码
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    //
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Text;
    
    namespace WSYL.Web.Common
    {
        /// <summary>
        /// Downloads a fixed web page and extracts the text of its highlighted table cells
        /// (left-aligned cells with background colour #EEEEEE).
        /// </summary>
        public static class GetSteamShipInfo
        {
            // Page to crawl. This is the placeholder from the original snippet; replace it with the real URL.
            private const string UrlToCrawl = @"网址";

            // Upper bound, in milliseconds, for both obtaining the response and each stream read,
            // so an unresponsive server cannot hang the caller for the framework defaults.
            private const int RequestTimeoutMs = 30000;

            // Matches one highlighted cell; group 1 captures the cell text (no nested tags allowed).
            private static readonly Regex CellRegex = new Regex(
                "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // Matches any HTML tag, including tags that span several lines.
            private static readonly Regex TagRegex = new Regex("<(.|\n)+?>");

            // Matches a run of whitespace so it can be collapsed into a single space.
            private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

            /// <summary>
            /// Fetches the page and returns text extracted from its highlighted table cells.
            /// </summary>
            /// <param name="steamshipname">
            /// Ship name. It is only validated here; the visible code does not include it in the
            /// request URL. NOTE(review): confirm whether the real URL should be built from it.
            /// </param>
            /// <param name="itype">
            /// 0 = text of the first matching cell; 1 = tag-stripped text of the second matching cell;
            /// 2 = both joined with "/". Any other value returns the whole downloaded page.
            /// </param>
            /// <returns>
            /// <c>null</c> when <paramref name="steamshipname"/> is null or blank; an empty string when
            /// no cell matches or the first cell is empty; otherwise the text selected by
            /// <paramref name="itype"/>.
            /// </returns>
            /// <exception cref="WebException">The request fails or times out.</exception>
            public static string GetWebSite(string steamshipname, int itype)
            {
                if (steamshipname == null || steamshipname.Trim().Length == 0)
                {
                    return null;
                }

                string html = DownloadHtml(UrlToCrawl);
                return ExtractInfo(html, itype);
            }

            /// <summary>
            /// Performs a GET request and returns the response body decoded as UTF-8.
            /// </summary>
            private static string DownloadHtml(string url)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                request.Timeout = RequestTimeoutMs;
                request.ReadWriteTimeout = RequestTimeoutMs;

                // Dispose the response, its stream and the reader even when reading throws;
                // otherwise the underlying connection is leaked.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Selects the requested cell text from the downloaded page.
            /// </summary>
            private static string ExtractInfo(string html, int itype)
            {
                Match firstCell = CellRegex.Match(html);
                string firstValue = firstCell.Groups[1].Value;

                // A failed match yields an empty group, so this covers both "no cell" and "empty cell".
                if (firstValue.Length == 0)
                {
                    return "";
                }

                switch (itype)
                {
                    case 0:
                        return firstValue;
                    case 1:
                        return StripHtml(firstCell.NextMatch().Value);
                    case 2:
                        return firstValue + "/" + StripHtml(firstCell.NextMatch().Value);
                    default:
                        // Kept from the original: unknown types get the raw page back.
                        return html;
                }
            }

            /// <summary>
            /// Removes HTML tags, escapes any leftover angle brackets and collapses runs of
            /// whitespace into single spaces.
            /// </summary>
            /// <param name="strHtml">HTML fragment to clean.</param>
            /// <returns>The trimmed plain text.</returns>
            private static string StripHtml(string strHtml)
            {
                string text = TagRegex.Replace(strHtml, "");
                text = text.Replace("<", "&lt;");
                text = text.Replace(">", "&gt;");
                text = WhitespaceRegex.Replace(text, " ");
                return text.Trim();
            }
        }
    }
    复制代码
    走在通往梦想国度的路上,加油!
  • 相关阅读:
    codeforces 455B A Lot of Games(博弈,字典树)
    HDU 4825 Xor Sum(二进制的字典树,数组模拟)
    hdu 1800 Flying to the Mars(简单模拟,string,字符串)
    codeforces 425A Sereja and Swaps(模拟,vector,枚举区间)
    codeforces 425B Sereja and Table(状态压缩,也可以数组模拟)
    HDU 4148 Length of S(n)(字符串)
    codeforces 439D Devu and Partitioning of the Array(有深度的模拟)
    浅谈sass
    京东楼层案例思维逻辑分析
    浅谈localStorage和sessionStorage
  • 原文地址:https://www.cnblogs.com/hs8888/p/5520564.html
Copyright © 2011-2022 走看看