zoukankan      html  css  js  c++  java
  • Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

            /// <summary>
            /// 抓取网页对应内容
            /// </summary>
            /// <param name="strUrl">採集地址</param>
            /// <param name="Begin">開始字符</param>
            /// <param name="End">结束字符</param>
            /// <returns></returns>
            private static String GetContent(String strUrl, String Begin, String End)
            {
                String result = String.Empty;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
                {
                    result = reader.ReadToEnd();
                    
                    reader.Close();
                    response.Close();
                }
    
                //抓取内容
                Match table = Regex.Match(result, "(?<=" + Begin + ")[\s\S]*?

    (?

    =" + End + ")", RegexOptions.IgnoreCase); result = NoHTML(table.Value); return result; } ///<summary> ///去除HTML标记 ///</summary> ///<param name="NoHTML">包含HTML的源代码 </param> ///<returns>已经去除后的文字</returns> private static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?

    >.*?

    </script>", "", RegexOptions.IgnoreCase); //删除HTML Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"([ ])[s]+", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", """, RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "xa1", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "xa2", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "xa3", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "xa9", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&#(d+);", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, ">", ""); Htmlstring = Regex.Replace(Htmlstring, "<", ""); Htmlstring = Regex.Replace(Htmlstring, " ", ""); Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf(" ") + 1); if (Htmlstring.LastIndexOf("'") >= 0) Htmlstring = Htmlstring.Substring(Htmlstring.LastIndexOf("'") + 1); if (Htmlstring.IndexOf("class='tdbk'") >= 0) Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf("class='tdbk'") + "class='tdbk'".Length); return Htmlstring; }



  • 相关阅读:
    第一册:lesson forty five。
    第一册:lesson forty three。
    马化腾2015港大演讲。
    Swing实现文件选择(目录选择)附导出
    SVN强制注释
    Websphere内存溢出的日志
    sql server2008 搭建链接服务器成功后查询时报Cannot obtain the schema rowset "DBSCHEMA_TABLES_INFO" for OLE DB provider "SQLNCLI10" for linked server "XXXXX". 的解决方法
    UML图例
    jSP的3种方式实现radio ,checkBox,select的默认选择值。
    通过js子页面回写父页面,改变父页面控件的值
  • 原文地址:https://www.cnblogs.com/hrhguanli/p/5058231.html
Copyright © 2011-2022 走看看