using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
namespace Common
{
/// <summary>
/// Helpers for downloading web pages and extracting links or delimited
/// fragments from HTML text.
/// </summary>
public class Utils
{
    /// <summary>
    /// Base address used by <see cref="getLinks(string)"/> to resolve relative links.
    /// </summary>
    private static readonly Uri DefaultBaseUri = new Uri("http://www.suning.com/");

    /// <summary>
    /// Matches an anchor tag up to the first following "&lt;/". The "url" group holds
    /// the href value and the "text" group holds whatever follows the opening tag.
    /// Built once because the pattern never changes.
    /// </summary>
    private static readonly Regex LinkRegex = new Regex(
        "(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>[\\s\\S]*?)</",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Extracts the targets of all anchor tags found in an HTML document, resolving
    /// relative links against the built-in default base address.
    /// </summary>
    /// <param name="html">HTML source to scan. Null or empty yields an empty list.</param>
    /// <returns>Absolute URLs, in document order. Never null.</returns>
    public static List<string> getLinks(string html)
    {
        return getLinks(html, DefaultBaseUri);
    }

    /// <summary>
    /// Extracts the targets of all anchor tags found in an HTML document, resolving
    /// relative links against <paramref name="baseUri"/>.
    /// </summary>
    /// <param name="html">HTML source to scan. Null or empty yields an empty list.</param>
    /// <param name="baseUri">Absolute URI that relative href values are resolved against.</param>
    /// <returns>Absolute URLs, in document order. Never null.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="baseUri"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="baseUri"/> is not absolute.</exception>
    public static List<string> getLinks(string html, Uri baseUri)
    {
        if (baseUri == null)
        {
            throw new ArgumentNullException("baseUri");
        }
        if (!baseUri.IsAbsoluteUri)
        {
            throw new ArgumentException("The base URI must be absolute.", "baseUri");
        }

        List<string> links = new List<string>();
        if (string.IsNullOrEmpty(html))
        {
            return links;
        }

        for (Match match = LinkRegex.Match(html); match.Success; match = match.NextMatch())
        {
            try
            {
                links.Add(new Uri(baseUri, match.Groups["url"].Value).AbsoluteUri);
            }
            catch (UriFormatException ex)
            {
                // A malformed href must not abort the scan: report it and keep going.
                Console.WriteLine(ex.Message);
            }
        }
        return links;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>
    /// The response body, decoded with <see cref="Encoding.Default"/> unless the
    /// stream starts with a byte order mark that indicates another encoding.
    /// </returns>
    public static string GetHtml(string url)
    {
        // Nested using blocks release the client, stream and reader even when
        // the download or the read fails part-way.
        using (WebClient client = new WebClient())
        using (Stream stream = client.OpenRead(url))
        using (StreamReader reader = new StreamReader(stream, Encoding.Default, true, 256000))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Requests <paramref name="uri"/> and returns the raw response stream.
    /// </summary>
    /// <param name="uri">Address to request.</param>
    /// <param name="cc">Cookie container to attach to the request; may be null.</param>
    /// <returns>
    /// The response body stream, which the caller must dispose, or null when the
    /// request could not be created or failed.
    /// </returns>
    public static Stream GetBaseStream(string uri, CookieContainer cc)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            if (cc != null)
            {
                request.CookieContainer = cc;
            }

            // Present browser-like headers to the server.
            request.ContentType = "application/x-www-form-urlencoded";
            request.Accept = @"application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
            request.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";

            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            try
            {
                return response.GetResponseStream();
            }
            catch
            {
                // No stream was handed out, so nobody else can release the response.
                response.Close();
                throw;
            }
        }
        catch (Exception ex)
        {
            // Callers rely on null for any failure; keep that contract but leave a trace.
            ReportFailure("GetBaseStream", ex);
            return null;
        }
    }

    /// <summary>
    /// Requests a page, optionally posting form data, and returns the response text.
    /// </summary>
    /// <param name="uri">Address to request.</param>
    /// <param name="postDate">
    /// Form data to send. When not null the request is sent as a POST with this
    /// text encoded as UTF-8; when null no body is sent.
    /// </param>
    /// <param name="cc">Cookie container to attach to the request; may be null.</param>
    /// <param name="encoding">Encoding of the response body; null falls back to UTF-8.</param>
    /// <returns>The response text, or null when the request failed.</returns>
    public static string GetHtmlString(string uri, string postDate, CookieContainer cc, Encoding encoding)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = true;
            request.Accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
            request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";
            request.CookieContainer = cc;

            if (postDate != null)
            {
                // HTTP method tokens are case-sensitive, so use the canonical spelling.
                request.Method = "POST";
                byte[] payload = Encoding.UTF8.GetBytes(postDate);
                request.ContentLength = payload.Length;
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(payload, 0, payload.Length);
                }
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, encoding ?? Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Callers rely on null for any failure; keep that contract but leave a trace.
            ReportFailure("GetHtmlString", ex);
            return null;
        }
    }

    /// <summary>
    /// Returns every fragment of <paramref name="html"/> that lies between a match of
    /// <paramref name="start"/> and the nearest following match of <paramref name="end"/>.
    /// </summary>
    /// <param name="html">Text to search. Null or empty yields an empty list.</param>
    /// <param name="start">
    /// Opening delimiter. It is inserted into a regular expression unescaped, so
    /// regex metacharacters in it are interpreted as such.
    /// </param>
    /// <param name="end">Closing delimiter; inserted unescaped like <paramref name="start"/>.</param>
    /// <returns>
    /// The text between each delimiter pair (at least one character, line breaks
    /// included), in order of appearance. Empty when nothing matches or when the
    /// delimiters do not form a valid pattern. Never null.
    /// </returns>
    public static List<string> GetStrings(string html, string start, string end)
    {
        List<string> fragments = new List<string>();
        if (string.IsNullOrEmpty(html))
        {
            return fragments;
        }

        try
        {
            // (.|[\r\n])+? lazily matches any characters, including line breaks,
            // so a fragment may span several lines.
            string pattern = string.Format("{0}(?<g>(.|[\r\n])+?){1}", start, end);
            foreach (Match match in Regex.Matches(html, pattern))
            {
                fragments.Add(match.Groups["g"].Value);
            }
        }
        catch (ArgumentException ex)
        {
            // The delimiters produced an invalid pattern: report it and return what we have.
            ReportFailure("GetStrings", ex);
        }
        return fragments;
    }

    /// <summary>
    /// Writes a one-line description of a handled failure to the trace listeners.
    /// </summary>
    private static void ReportFailure(string operation, Exception ex)
    {
        System.Diagnostics.Trace.WriteLine("Common.Utils." + operation + " failed: " + ex.Message);
    }
}
}