zoukankan      html  css  js  c++  java
  • Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

            /// <summary>
            /// Downloads the page at <paramref name="strUrl"/>, extracts the text found between
            /// <paramref name="Begin"/> and <paramref name="End"/>, and returns it with HTML
            /// markup stripped by <see cref="NoHTML"/>.
            /// </summary>
            /// <param name="strUrl">Address of the page to fetch.</param>
            /// <param name="Begin">
            /// Text that precedes the wanted content. It is inserted into the regular expression
            /// as-is (not escaped), so regex metacharacters in it are interpreted as pattern syntax.
            /// </param>
            /// <param name="End">
            /// Text that follows the wanted content. Inserted into the pattern as-is, like
            /// <paramref name="Begin"/>.
            /// </param>
            /// <returns>
            /// The cleaned text of the first (shortest) match, or an empty string when nothing matches.
            /// </returns>
            /// <remarks>
            /// Exceptions raised by the request or by an invalid pattern are not caught here and
            /// propagate to the caller.
            /// </remarks>
            private static String GetContent(String strUrl, String Begin, String End)
            {
                String result;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

                // Dispose the response and the reader even when reading throws; disposing them
                // also closes the underlying stream, so no explicit Close() calls are needed.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
                    {
                        result = reader.ReadToEnd();
                    }
                }

                // Lazily capture everything between Begin and End. [\s\S] is used instead of '.'
                // so the capture can span line breaks; the look-arounds keep the delimiters
                // themselves out of the match.
                String pattern = "(?<=" + Begin + @")[\s\S]*?(?=" + End + ")";
                Match table = Regex.Match(result, pattern, RegexOptions.IgnoreCase);

                return NoHTML(table.Value);
            }

            /// <summary>
            /// Removes script blocks, HTML tags and comment markers from a fragment of HTML and
            /// decodes a small set of character entities.
            /// </summary>
            /// <param name="Htmlstring">Source text that contains HTML.</param>
            /// <returns>
            /// The text with markup removed. All space characters are removed as well, and only
            /// the part after the last apostrophe (if any) is kept. Returns an empty string for
            /// null or empty input.
            /// </returns>
            private static string NoHTML(string Htmlstring)
            {
                if (string.IsNullOrEmpty(Htmlstring))
                {
                    return string.Empty;
                }

                // Remove script blocks. Singleline lets '.' cross line breaks so that scripts
                // spanning several lines are removed together with their body.
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

                // Remove HTML tags.
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

                // Remove a line break together with the whitespace that follows it.
                Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

                // Remove leftover comment markers; an opening marker is removed up to the end of its line.
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

                // Decode the supported character entities.
                Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

                // Any other numeric entity is dropped.
                Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

                // Remove stray angle brackets and every space character.
                // NOTE(review): the published listing shows " " here; it may originally have been
                // "\r\n" before the page lost its escape sequences - confirm against the real source.
                Htmlstring = Htmlstring.Replace(">", "").Replace("<", "").Replace(" ", "");

                // The published listing next cut the text after its first space. No space can
                // remain at this point, so that step never changed anything and is omitted.

                // Keep only what follows the last apostrophe, if there is one.
                int lastQuote = Htmlstring.LastIndexOf('\'');
                if (lastQuote >= 0)
                {
                    Htmlstring = Htmlstring.Substring(lastQuote + 1);
                }

                // The published listing then looked for "class='tdbk'"; the text cannot contain
                // an apostrophe any more, so that check could never succeed and is omitted.
                return Htmlstring;
            }



  • 相关阅读:
    Java-使用IO流对大文件进行分割和分割后的合并
    Java-单向链表算法
    Java-二分查找算法
    Java-二叉树算法
    Java-对象比较器
    Android中Activity的四种开发模式
    Struts2工作原理
    C++实现单例模式
    数组中有一个数字出现的次数超过数组的一半,请找出这个数字
    c++ enum用法【转】
  • 原文地址:https://www.cnblogs.com/hrhguanli/p/5058231.html
Copyright © 2011-2022 走看看