zoukankan      html  css  js  c++  java
  • Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

            /// <summary>
            /// Downloads the page at <paramref name="strUrl"/> and returns the text located between
            /// the <paramref name="Begin"/> and <paramref name="End"/> markers, with HTML markup removed.
            /// </summary>
            /// <param name="strUrl">Address of the page to fetch.</param>
            /// <param name="Begin">Pattern that must immediately precede the wanted content. It is
            /// concatenated into the regular expression unescaped, so regex metacharacters in it are
            /// interpreted as regex syntax.</param>
            /// <param name="End">Pattern that must immediately follow the wanted content. It is
            /// concatenated into the regular expression unescaped, like <paramref name="Begin"/>.</param>
            /// <returns>The first, shortest match after it has been passed through
            /// <see cref="NoHTML"/>; an empty string when nothing matches.</returns>
            private static String GetContent(String strUrl, String Begin, String End)
            {
                String result;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

                // Both the response and the reader are disposed even when reading throws.
                // NOTE(review): Encoding.Default depends on the runtime and platform; confirm it
                // matches the charset of the pages being fetched.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
                {
                    result = reader.ReadToEnd();
                }

                // Lazily match everything, line breaks included, that sits between the two markers.
                // The markers themselves are excluded by the look-behind / look-ahead.
                Match table = Regex.Match(result, "(?<=" + Begin + @")[\s\S]*?(?=" + End + ")", RegexOptions.IgnoreCase);

                return NoHTML(table.Value);
            }

            /// <summary>
            /// Removes HTML markup from a fragment and decodes a small set of character entities.
            /// </summary>
            /// <param name="Htmlstring">Source text containing HTML.</param>
            /// <returns>The text that remains after the markup has been removed.</returns>
            private static string NoHTML(string Htmlstring)
            {
                // Remove script elements. Singleline lets '.' cross line breaks so that
                // multi-line script bodies are removed together with their tags.
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

                // Remove the remaining tags.
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

                // Remove a space together with the whitespace run that follows it.
                // NOTE(review): the leading class may originally have been [\r\n] - confirm.
                Htmlstring = Regex.Replace(Htmlstring, @"([ ])[\s]+", "", RegexOptions.IgnoreCase);

                // Remove comment terminators, then everything from a comment opener to the end of its line.
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

                // Decode the supported named / numeric entities.
                Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

                // Drop every other numeric entity.
                Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

                // Strip angle brackets that survived tag removal or were produced by entity decoding.
                Htmlstring = Regex.Replace(Htmlstring, ">", "");
                Htmlstring = Regex.Replace(Htmlstring, "<", "");

                // NOTE(review): this removes every space character, and the " " literals here and on
                // the next statement may originally have been "\r\n" - confirm against the pages scraped.
                Htmlstring = Regex.Replace(Htmlstring, " ", "");
                Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf(" ") + 1);

                // Keep only what follows the last apostrophe, if there is one.
                int lastQuote = Htmlstring.LastIndexOf("'");
                if (lastQuote >= 0)
                {
                    Htmlstring = Htmlstring.Substring(lastQuote + 1);
                }

                return Htmlstring;
            }



  • 相关阅读:
    CSP-S 代码基本框架
    Gradle build finished with 100 error(s) in 14s 629ms
    opencv2.3. 9+vs2012
    ButterKnife-- ButterKnife.bind(this); @BindView(R.id.bottomSelectView) BottomSelectView bottomSelectView;
    递归实现数组求和
    data structure begin!!
    递归实现全排列算法-161029
    简单粗暴-文件拓展名+任务管理器
    try to write a server
    在TextView中实时显示数据
  • 原文地址:https://www.cnblogs.com/hrhguanli/p/5058231.html
Copyright © 2011-2022 走看看