zoukankan      html  css  js  c++  java
  • asp.net 网页抓取内容

    网页抓取代码

    复制代码
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    //
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Text;
    
    namespace WSYL.Web.Common
    {
        public static class GetSteamShipInfo
        {
            public static string GetWebSite(string steamshipname,int itype)
            {
                if (steamshipname == null || steamshipname.Trim() == "")
                    return null;
                //step1: get html from url
                string urlToCrawl = @"网址";
                //generate http request
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlToCrawl);
                //use GET method to get url's html
                req.Method = "GET";
                //use request to get response
                HttpWebResponse resp = (HttpWebResponse)req.GetResponse();
                // 二〇一五年八月十二日 18:14:45 需要增加判断网页解析超时问题 防止网页假死
                // string htmlCharset = "UTF-8";
                string htmlCharset = "utf-8";
                //use songtaste's html's charset GB2312 to decode html
                //otherwise will return messy code
                Encoding htmlEncoding = Encoding.GetEncoding(htmlCharset);
                StreamReader sr = new StreamReader(resp.GetResponseStream(), htmlEncoding);
                //read out the returned html
                string respHtml = sr.ReadToEnd();
                //第三种获取内容
                //Match TitleMatch = Regex.Match(rtbExtractedHtml.Text.ToString(), "<td width=\"30%\">([^<]*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    //需要获取的代码开始和结尾内容
    Match TitleMatch2 = Regex.Match(respHtml.ToString(), "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline); // txbExtractedInfo.Text = TitleMatch2.Groups[1].Value+"/"+ TitleMatch2.Groups[2].Value; if (TitleMatch2.Groups[1].Value.Length == 0 || TitleMatch2.Groups[1].Value=="") return respHtml = ""; if(itype==0) { respHtml = TitleMatch2.Groups[1].Value.ToString(); } if(itype==1) { respHtml = StripHtml(TitleMatch2.NextMatch().Value.ToString()); } if (itype == 2) { respHtml = TitleMatch2.Groups[1].Value + "/" + StripHtml(TitleMatch2.NextMatch().Value.ToString()); } return respHtml; } /// <summary> /// 去除html标签和空格有些例外会使得去除不干净,所以建议连续两次转化。这样将Html标签转化为了空格。太多连续的空格会影响之后对字符串的操作 /// </summary> /// <param name="strHtml">标签内容</param> /// <returns></returns> private static string StripHtml(string strHtml) { Regex objRegExp = new Regex("<(.|\n)+?>"); string strOutput = objRegExp.Replace(strHtml, ""); strOutput = strOutput.Replace("<", "&lt;"); strOutput = strOutput.Replace(">", "&gt;"); //把所有空格变为一个空格 Regex r = new Regex(@"\s+"); strOutput = r.Replace(strOutput, " "); return strOutput.Trim(); } } }
    复制代码
    走在通往梦想国度的路上,加油!
  • 相关阅读:
    SVN还原项目到某一版本(转)
    C# Web Service 不使用服务引用直接调用方法(转)
    动态调用webservice时 ServiceDescriptionImporter类在vs2010无法引用的解决方法 (转)
    log4net示例2-日志输入存入Access(转)
    C# log4net 配置及使用详解--日志保存到文件和Access(转)
    未能解析引用的程序集......因为它对不在当前目标框架“.NETFramework,Version=v4.0,Profile=Client”中的 (转)
    Hello log4net——做一个实用好用的log4net的demo(转)
    JS移动客户端--触屏滑动事件
    js生成二维码实例
    触屏版类似刷新页面文本框获取焦点的同时弹出手机键盘的做法
  • 原文地址:https://www.cnblogs.com/hs8888/p/5520564.html
Copyright © 2011-2022 走看看