下面是一个获取HTML也一段代码
string str = "http://www.sooboo.com.cn/Services/NewsList.aspx";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(str);
request.Method = "Get";
request.ContentType = "application/x-www-form-urlencoded";
WebResponse response = request.GetResponse();
Stream s = response.GetResponseStream();
StreamReader sr = new StreamReader(s, System.Text.Encoding.GetEncoding("utf-8"));
string html = sr.ReadToEnd();
s.Close();
sr.Close();
Response.Write(htm);
以上这部分没问题了成功,哈哈HttpWebRequest request = (HttpWebRequest)WebRequest.Create(str);
request.Method = "Get";
request.ContentType = "application/x-www-form-urlencoded";
WebResponse response = request.GetResponse();
Stream s = response.GetResponseStream();
StreamReader sr = new StreamReader(s, System.Text.Encoding.GetEncoding("utf-8"));
string html = sr.ReadToEnd();
s.Close();
sr.Close();
Response.Write(htm);
我们在说下一步
分析代码,代码我们是获取到了,但是我们怎么样才能提取到我们想要的东西呢, 例如,新闻标题,URL,时间等
在看我下面的一段代码
public string[] GetData(string Html)
{
String[] rS = new String[2];为什么这里要2位的数组
string s = Html;
s = Regex.Replace(s, "\\s{3,}", "");
s = s.Replace("\r", "");
s = s.Replace("\n", "");
string Pat = "<td align=\"center\" class=\"24p\"><B>(.*)</B></td></tr><tr>.*(<table width=\"95%\" border=\"0\" cellspacing=\"0\" cellpadding=\"10\">.*</table>)<table width=\"98%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">(.*)<td align=center class=l6h>";
string pt = "<a href='(.*)'";
Regex Re = new Regex(pt);
Match Ma = Re.Match(s);//我想实现以下功能,第一获取网站左右的URL和连接标题, 我自己写了几个表达式,但是都得不到满意的效果,大家帮忙想想
if (Ma.Success)
{
rS[0] = Ma.Groups[1].ToString();
rS[1] = Ma.Groups[2].ToString();
string pgStr = Ma.Groups[3].ToString();
}
Response.Write(Ma.Groups.Count);
Response.Write(rS[0].ToString());
Response.Write(rS[1].ToString());
return rS;
}
现在的问题就出现了,第一怎么样获取文章标题和连接啊,主要也就是对HTML的分析,高手们一起讨论一下吧,互相学习{
String[] rS = new String[2];为什么这里要2位的数组
string s = Html;
s = Regex.Replace(s, "\\s{3,}", "");
s = s.Replace("\r", "");
s = s.Replace("\n", "");
string Pat = "<td align=\"center\" class=\"24p\"><B>(.*)</B></td></tr><tr>.*(<table width=\"95%\" border=\"0\" cellspacing=\"0\" cellpadding=\"10\">.*</table>)<table width=\"98%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">(.*)<td align=center class=l6h>";
string pt = "<a href='(.*)'";
Regex Re = new Regex(pt);
Match Ma = Re.Match(s);//我想实现以下功能,第一获取网站左右的URL和连接标题, 我自己写了几个表达式,但是都得不到满意的效果,大家帮忙想想
if (Ma.Success)
{
rS[0] = Ma.Groups[1].ToString();
rS[1] = Ma.Groups[2].ToString();
string pgStr = Ma.Groups[3].ToString();
}
Response.Write(Ma.Groups.Count);
Response.Write(rS[0].ToString());
Response.Write(rS[1].ToString());
return rS;
}
就用http://www.sooboo.com.cn/Services/NewsList.aspx这个地址来分析吧