/// <summary>
/// Click handler: when the text box content contains something shaped like a URL,
/// downloads it with <c>GetHtml</c> and passes the result through <c>StripHTML</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string s1 = this.textBox1.Text;

    // One or more ASCII letters as the scheme, then "://", then any run of non-whitespace.
    // The class must be [a-zA-Z]: the range A-z also covers the punctuation that sits
    // between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which is not valid in a scheme.
    string match = @"[a-zA-Z]+://[^\s]*";
    Regex reg = new Regex(match);

    if (!reg.IsMatch(s1))
    {
        return;
    }

    try
    {
        string tmp = GetHtml(s1);
        // The stripped text is not consumed anywhere in this handler.
        string tmpend = StripHTML(tmp);
    }
    catch (Exception ex)
    {
        // Best effort: a failure here must not crash the UI, but leave a trace
        // for debugging instead of discarding the exception silently.
        System.Diagnostics.Debug.WriteLine(ex);
    }
}
1.获取HTML
GetHtml(String Url)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its body as text.
/// The body is decoded as UTF-8 first; if <c>isLuan</c> reports garbled characters in that
/// result, the same bytes are decoded as GB2312 instead.
/// </summary>
/// <param name="Url">Address to request; passed unchanged to <c>WebRequest.Create</c>.</param>
/// <returns>
/// The decoded body, or <c>null</c> when the request or the download failed
/// (the error text is shown in a message box in that case).
/// </returns>
public String GetHtml(String Url)
{
    WebRequest request = WebRequest.Create(Url);
    request.Timeout = 50000;

    // Fetch the body once and keep the raw bytes so that both decodings work on the
    // same data, instead of issuing a second request for the second encoding.
    byte[] body;
    try
    {
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }

            body = buffer.ToArray();
        }
    }
    catch (WebException e)
    {
        MessageBox.Show(e.Message);
        return null;
    }
    catch (Exception e)
    {
        MessageBox.Show(e.ToString());
        return null;
    }

    string utf8Text = DecodeResponseBody(body, Encoding.GetEncoding("UTF-8"));
    if (!isLuan(utf8Text))
    {
        return utf8Text;
    }

    return DecodeResponseBody(body, Encoding.GetEncoding("GB2312"));
}

/// <summary>
/// Decodes <paramref name="data"/> through a <c>StreamReader</c> using the given encoding.
/// </summary>
private static string DecodeResponseBody(byte[] data, Encoding encoding)
{
    using (StreamReader reader = new StreamReader(new MemoryStream(data), encoding))
    {
        return reader.ReadToEnd();
    }
}
2.去除HTML标记(正则表达式)
StripHTML(string strHtml)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Removes HTML markup from a string and returns the remaining text.
/// </summary>
/// <param name="strHtml">Source text that may contain HTML.</param>
/// <returns>
/// The text with script/style elements, tags, line breaks and tabs removed,
/// <c>&amp;nbsp;</c> entities turned into spaces, runs of spaces collapsed to one,
/// and straight apostrophes replaced by ’. Returns an empty string for null or empty input.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Remove <script>…</script> and <style>…</style> together with their content.
    string text = Regex.Replace(strHtml, "(?is)<script[^>]*>.*?</script>", "");
    text = Regex.Replace(text, "(?is)<style[^>]*>.*?</style>", "");

    // Turn the non-breaking-space entity into a plain space. The listing showed a bare
    // space in this pattern, which made the step a no-op; the comment beside it describes
    // replacing the entity, so the entity is spelled out here.
    text = Regex.Replace(text, "(?i)&nbsp;", " ");

    // Drop line breaks and tabs. '+' instead of '*' so the pattern never matches the
    // empty string at every position.
    text = Regex.Replace(text, @"[\r\n\t]+", "");

    // Remove well-formed tags.
    text = Regex.Replace(text, "<[^<>]*>", "");

    // Replace straight apostrophes with the typographic one.
    text = text.Replace("'", "’");

    // Collapse runs of spaces into a single space.
    text = Regex.Replace(text, " +", " ");

    // Second pass: removing tags can join fragments into a new "<…>" sequence.
    text = Regex.Replace(text, "<.+?>", "");

    // Remove any stray angle brackets and any entity that was re-formed by the removals.
    // Plain spaces are kept; deleting them would undo the collapsing step above.
    text = text.Replace("<", "");
    text = text.Replace(">", "");
    text = text.Replace("&nbsp;", "");

    return text;
}
3.判断是否为乱码(编码):在GetHtml里调用。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
/// <summary>
/// Reports whether the text contains garbled characters, detected as the byte sequence
/// 239 191 189 (EF BF BD, the UTF-8 form of U+FFFD) in the text's UTF-8 encoding.
/// </summary>
/// <param name="txt">Text to inspect.</param>
/// <returns><c>true</c> if the sequence occurs anywhere; <c>false</c> otherwise, including for null or empty input.</returns>
bool isLuan(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    byte[] bytes = Encoding.UTF8.GetBytes(txt);

    // i + 2 must stay in range, so the last start index is Length - 3 (inclusive).
    // The earlier bound (i < Length - 3) skipped that position and missed a marker
    // sitting at the very end of the text.
    for (int i = 0; i + 2 < bytes.Length; i++)
    {
        if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
        {
            return true;
        }
    }

    return false;
}