zoukankan html css js c++ java

asp.net 网页抓取内容

网页抓取代码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
//
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;

namespace WSYL.Web.Common
{
    /// <summary>
    /// Downloads a fixed web page and extracts the text of the table cells
    /// rendered as <c>&lt;td align="left" bgcolor="#EEEEEE"&gt;</c>.
    /// </summary>
    public static class GetSteamShipInfo
    {
        // Address of the page to crawl. The value is the placeholder from the
        // original source and must be replaced with a real URL before use.
        private const string UrlToCrawl = @"网址";

        // Charset used to decode the response body; decoding with the wrong
        // charset yields garbled text.
        private const string HtmlCharset = "utf-8";

        // Matches one target cell; group 1 captures the cell's text content.
        private static readonly Regex CellRegex = new Regex(
            "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Matches a single HTML tag, including tags that span several lines.
        private static readonly Regex TagRegex = new Regex("<(.|\n)+?>");

        // Matches a run of one or more whitespace characters.
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>
        /// Fetches the page and returns the text of the first and/or second
        /// matching table cell, depending on <paramref name="itype"/>.
        /// </summary>
        /// <param name="steamshipname">
        /// Must be non-null and non-blank, otherwise no request is made.
        /// It is currently not used to build the request.
        /// </param>
        /// <param name="itype">
        /// 0 = text of the first matching cell; 1 = second matching cell with
        /// its markup stripped; 2 = both, joined by "/". Any other value
        /// returns the whole downloaded page.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="steamshipname"/> is null or blank;
        /// an empty string when no cell matches or the first cell is empty;
        /// otherwise the value selected by <paramref name="itype"/>.
        /// </returns>
        public static string GetWebSite(string steamshipname, int itype)
        {
            if (string.IsNullOrWhiteSpace(steamshipname))
            {
                return null;
            }

            string respHtml = DownloadHtml(UrlToCrawl);

            Match firstCell = CellRegex.Match(respHtml);
            string firstText = firstCell.Groups[1].Value;
            if (firstText.Length == 0)
            {
                // Either nothing matched or the first cell has no text.
                return "";
            }

            switch (itype)
            {
                case 0:
                    return firstText;
                case 1:
                    return StripHtml(firstCell.NextMatch().Value);
                case 2:
                    return firstText + "/" + StripHtml(firstCell.NextMatch().Value);
                default:
                    // Unknown selector: the raw page is returned unchanged.
                    return respHtml;
            }
        }

        /// <summary>
        /// Issues a GET request for <paramref name="url"/> and returns the
        /// response body decoded with <see cref="HtmlCharset"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The complete response body as text.</returns>
        private static string DownloadHtml(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";

            // TODO (noted 2015-08-12): add explicit handling for slow or hung
            // pages; only the framework's default timeouts apply here.

            // Dispose the response, stream and reader so the underlying
            // connection is released even when reading fails.
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream body = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(HtmlCharset)))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Removes HTML tags from the input, encodes any remaining angle
        /// brackets as entities, and collapses each run of whitespace into a
        /// single space so later string handling is not disturbed by it.
        /// </summary>
        /// <param name="strHtml">Markup to clean.</param>
        /// <returns>The cleaned text with leading and trailing whitespace trimmed.</returns>
        private static string StripHtml(string strHtml)
        {
            string strOutput = TagRegex.Replace(strHtml, "");

            // Stray brackets that were not part of a complete tag are encoded
            // instead of being dropped.
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Collapse every whitespace run into one space.
            strOutput = WhitespaceRegex.Replace(strOutput, " ");
            return strOutput.Trim();
        }
    }
}

查看全文

相关阅读:
codevs1044 拦截导弹(最长不下降子序列dp)
codevs1014 装箱问题(DP)
codevs1068 乌龟棋(DP)
angular.extend vs angular.copy
angular input框点击别处变成不可输入状态
 angular select框 option空行
 angular ui 路由传参
 setTimeout 传参
 设置请求头信息的不同方式
 vertical-align

原文地址：https://www.cnblogs.com/zangdalei/p/5329428.html