zoukankan      html  css  js  c++  java
  • asp.net 网页抓取内容

    网页抓取代码

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    //
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Text;
    
    namespace WSYL.Web.Common
    {
        /// <summary>
        /// Downloads a fixed web page and extracts the text of specific table
        /// cells from it with regular expressions.
        /// </summary>
        public static class GetSteamShipInfo
        {
            /// <summary>
            /// Address of the page to crawl. This is the placeholder text from the
            /// original sample and must be replaced with a real absolute URL
            /// before the method can succeed.
            /// </summary>
            private const string CrawlUrl = @"网址";

            /// <summary>
            /// Upper bound, in milliseconds, for both connecting and reading, so a
            /// stalled server cannot hang the caller for the framework defaults.
            /// </summary>
            private const int RequestTimeoutMilliseconds = 15000;

            /// <summary>
            /// Matches a left-aligned, grey-background table cell and captures its
            /// tag-free text in group 1.
            /// </summary>
            private static readonly Regex CellRegex = new Regex(
                "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);

            /// <summary>Matches any HTML tag, including tags that span several lines.</summary>
            private static readonly Regex TagRegex = new Regex("<(.|\n)+?>");

            /// <summary>Matches a run of one or more whitespace characters.</summary>
            private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

            /// <summary>
            /// Fetches the configured page and returns the text of the matching
            /// table cell(s).
            /// </summary>
            /// <param name="steamshipname">
            /// Name of the ship. It is only validated here; the visible code does
            /// not send it as part of the request.
            /// </param>
            /// <param name="itype">
            /// 0 = text of the first matching cell; 1 = text of the second matching
            /// cell; 2 = both joined by "/". Any other value returns the page HTML
            /// unchanged.
            /// </param>
            /// <returns>
            /// <c>null</c> when <paramref name="steamshipname"/> is null or blank,
            /// an empty string when the first cell is missing or empty, otherwise
            /// the value selected by <paramref name="itype"/>.
            /// </returns>
            /// <exception cref="WebException">
            /// The request failed or timed out.
            /// </exception>
            public static string GetWebSite(string steamshipname, int itype)
            {
                if (steamshipname == null || steamshipname.Trim().Length == 0)
                {
                    return null;
                }

                string html = DownloadHtml(CrawlUrl);
                return ExtractInfo(html, itype);
            }

            /// <summary>
            /// Performs an HTTP GET against <paramref name="url"/> and returns the
            /// body decoded as UTF-8.
            /// </summary>
            private static string DownloadHtml(string url)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                // Explicit limits so an unresponsive page cannot block the caller.
                request.Timeout = RequestTimeoutMilliseconds;
                request.ReadWriteTimeout = RequestTimeoutMilliseconds;

                // Dispose the response and reader so the connection is released
                // back to the pool even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Picks the requested cell text out of <paramref name="html"/>.
            /// </summary>
            private static string ExtractInfo(string html, int itype)
            {
                Match firstCell = CellRegex.Match(html);
                string firstText = firstCell.Groups[1].Value;

                // Covers both "no match" and "matched an empty cell".
                if (firstText.Length == 0)
                {
                    return "";
                }

                switch (itype)
                {
                    case 0:
                        return firstText;
                    case 1:
                        // NextMatch().Value is the whole <td>...</td>, so strip the tags.
                        return StripHtml(firstCell.NextMatch().Value);
                    case 2:
                        return firstText + "/" + StripHtml(firstCell.NextMatch().Value);
                    default:
                        // Unknown mode: hand back the raw page, as the original did.
                        return html;
                }
            }

            /// <summary>
            /// Removes HTML tags, escapes any remaining angle brackets, and
            /// collapses every run of whitespace into a single space.
            /// </summary>
            /// <param name="strHtml">Markup fragment to clean.</param>
            /// <returns>The trimmed, tag-free text.</returns>
            private static string StripHtml(string strHtml)
            {
                string text = TagRegex.Replace(strHtml, "");
                // Stray brackets left after tag removal are escaped, not dropped.
                text = text.Replace("<", "&lt;");
                text = text.Replace(">", "&gt;");
                text = WhitespaceRegex.Replace(text, " ");
                return text.Trim();
            }
        }
    }
  • 相关阅读:
    结对编程之附加题:单元测试
    机器学习第二次作业
    第一次作业
    机器学习第二次作业
    机器学习第一次个人作业
    软工实践个人总结
    第08组 Beta版本演示
    第08组 Beta冲刺(5/5)
    第08组 Beta冲刺(4/5)
    第08组 Beta冲刺(3/5)
  • 原文地址:https://www.cnblogs.com/zangdalei/p/5329428.html
Copyright © 2011-2022 走看看