zoukankan      html  css  js  c++  java
  • Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

    Asp.net 使用正则和网络编程抓取网页数据(有用)

            /// <summary>
            /// Downloads the page at <paramref name="strUrl"/>, extracts the first span of text that is
            /// preceded by <paramref name="Begin"/> and followed by <paramref name="End"/>, and returns
            /// that span with HTML markup removed (see <see cref="NoHTML"/>).
            /// </summary>
            /// <param name="strUrl">Address of the page to fetch.</param>
            /// <param name="Begin">
            /// Start marker. It is inserted into the regular expression verbatim, so regex
            /// metacharacters in it are interpreted as pattern syntax, not as literal text.
            /// </param>
            /// <param name="End">End marker; inserted into the regular expression verbatim, like <paramref name="Begin"/>.</param>
            /// <returns>The cleaned text between the markers, or an empty string when nothing matches.</returns>
            private static String GetContent(String strUrl, String Begin, String End)
            {
                String result;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

                // Dispose the response and the reader even when reading throws, so the
                // underlying connection is always released.
                // NOTE(review): Encoding.Default is platform dependent; confirm it matches the
                // encoding of the pages being scraped.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
                {
                    result = reader.ReadToEnd();
                }

                // Lazily capture everything (including newlines) located after Begin and before End.
                // The constant parts are verbatim strings so that \s and \S reach the regex engine.
                string pattern = "(?<=" + Begin + @")[\s\S]*?(?=" + End + ")";
                Match table = Regex.Match(result, pattern, RegexOptions.IgnoreCase);

                return NoHTML(table.Value);
            }

            /// <summary>
            /// Removes script blocks, HTML tags and comments from a fragment of markup and decodes
            /// a small set of HTML entities.
            /// </summary>
            /// <param name="Htmlstring">Source text that may contain HTML.</param>
            /// <returns>The text with markup removed; an empty string for null or empty input.</returns>
            private static string NoHTML(string Htmlstring)
            {
                if (string.IsNullOrEmpty(Htmlstring))
                {
                    return string.Empty;
                }

                // Remove script blocks. Singleline lets '.' span line breaks so that multi-line
                // script bodies are removed together with their tags.
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

                // Remove HTML tags, line breaks followed by whitespace, and comment remnants.
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

                // Decode the handful of named / numeric entities this helper knows about.
                Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

                // Drop any remaining numeric character references.
                Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

                // Strip leftover angle brackets and spaces.
                // NOTE(review): the " " literals below may originally have been "\r\n"; the published
                // listing lost its escape sequences, so confirm against the intended behavior.
                Htmlstring = Regex.Replace(Htmlstring, ">", "");
                Htmlstring = Regex.Replace(Htmlstring, "<", "");
                Htmlstring = Regex.Replace(Htmlstring, " ", "");

                // Keep only what follows the first space (a no-op once every space has been removed above).
                Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf(" ", StringComparison.Ordinal) + 1);

                // Keep only what follows the last single quote, if there is one.
                int lastQuote = Htmlstring.LastIndexOf("'", StringComparison.Ordinal);
                if (lastQuote >= 0)
                {
                    Htmlstring = Htmlstring.Substring(lastQuote + 1);
                }

                // NOTE(review): after the cut above the text contains no single quote, so this
                // marker can no longer be found; kept to mirror the original listing.
                const string marker = "class='tdbk'";
                int markerIndex = Htmlstring.IndexOf(marker, StringComparison.Ordinal);
                if (markerIndex >= 0)
                {
                    Htmlstring = Htmlstring.Substring(markerIndex + marker.Length);
                }

                return Htmlstring;
            }



  • 相关阅读:
    尝试用Gearman实现分布式处理(PHP)
    如何在ubuntu上安装node.js
    Linux crontab定时执行任务 命令格式与详细例子
    MySQL的information_schema
    Ubuntu安装MongoDB
    Log4j日志管理系统简单使用说明
    android:screenOrientation横屏竖屏设置
    Android 自定义progressBar样式
    Java使用JDOM解析XML(转载,简单又详细)
    Caused by: android.util.AndroidRuntimeException: Calling startActivity() from outside of an Activity
  • 原文地址:https://www.cnblogs.com/hrhguanli/p/5058231.html
Copyright © 2011-2022 走看看