一、用ASP.NET实现正则抓取匹配 www.163.com 首页的财经专栏
要求:提取出链接和标题
时间:1小时内
注:首先,需要动态获取 http://www.163.com 的内容;获取到网易的首页内容后,用正则进行匹配,得到想要的内容,并把标题和链接保存为数组。
我的完成:
代码
using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Code-behind for a page that downloads the www.163.com home page and
/// extracts the links and titles found between the finance-column header
/// anchor and the health-column label.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Anchor that opens the finance column on the home page. The extracted
    // section starts at (and includes) this anchor.
    private const string SectionStartMarker = "<a href=\"http://money.163.com/blog/\">财经专栏</a>";

    // Label of the column that follows; the extracted section stops just before it.
    private const string SectionEndMarker = "健康专栏";

    // Matches <a href=...>text</a>. The optional quote characters around the
    // href value are consumed OUTSIDE the Link group so that Link holds the
    // bare URL rather than the URL wrapped in quotes.
    private static readonly Regex AnchorRegex = new Regex(
        @"<a\s+href=[""']?(?<Link>[^""'\s>]+)[""']?[^>]*>(?<Text>[^<]*)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads the home page, collects the link and title of every anchor
    /// in the finance section into two parallel arrays, and renders them as
    /// a list of links in <c>Label1</c>.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btn163_Click(object sender, EventArgs e)
    {
        string section = ExtractFinanceSection(GetSiteContent());
        if (section == null)
        {
            // The page layout changed or the download was empty. Report it
            // instead of letting Substring throw on a -1 index.
            this.Label1.Text = "未找到财经专栏内容";
            return;
        }

        MatchCollection matches = AnchorRegex.Matches(section);
        string[] links = new string[matches.Count];
        string[] titles = new string[matches.Count];
        StringBuilder html = new StringBuilder();

        for (int i = 0; i < matches.Count; i++)
        {
            // The captured values come from HTML source, so decode entities
            // to get the real URL / title before storing them.
            links[i] = HttpUtility.HtmlDecode(matches[i].Groups["Link"].Value);
            titles[i] = HttpUtility.HtmlDecode(matches[i].Groups["Text"].Value).Trim();

            // Re-encode on output so remote content cannot inject markup.
            html.AppendFormat(
                "<a href=\"{0}\">{1}</a><br />",
                HttpUtility.HtmlAttributeEncode(links[i]),
                HttpUtility.HtmlEncode(titles[i]));
        }

        this.Label1.Text = html.ToString();
    }

    /// <summary>
    /// Returns the part of <paramref name="html"/> that starts at the finance
    /// column anchor and ends just before the health column label.
    /// </summary>
    /// <param name="html">Full page markup; may be null or empty.</param>
    /// <returns>The section markup, or null when either marker is missing.</returns>
    private static string ExtractFinanceSection(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        // Ordinal search: the markers are literal markup, not linguistic text.
        int start = html.IndexOf(SectionStartMarker, StringComparison.Ordinal);
        if (start < 0)
        {
            return null;
        }

        int end = html.IndexOf(SectionEndMarker, start, StringComparison.Ordinal);
        if (end < 0)
        {
            return null;
        }

        return html.Substring(start, end - start);
    }

    /// <summary>
    /// Downloads http://www.163.com and returns the response body as text.
    /// </summary>
    /// <returns>The page markup decoded with <see cref="Encoding.Default"/>.</returns>
    private string GetSiteContent()
    {
        WebRequest request = WebRequest.Create("http://www.163.com");

        // using blocks release the response, stream and reader even when
        // reading throws part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        // NOTE(review): Encoding.Default is the machine's ANSI code page, so
        // decoding is only correct where that matches the page's charset -
        // confirm against the deployment environment.
        using (StreamReader reader = new StreamReader(stream, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
二、有一个文本,内容为
http://www.test.com/info.html
http://www.test.com/info3.html
http://www.test.com/info2.html
http://www.test.com/info1.html
http://www.test.com/info2.html
http://www.test.com/info1.html
要求结果为去除重复项,按出现次数进行排序
我的完成:
代码
using System;
using System.Collections;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.IO;
using System.Text;
namespace WebApplication1
{
    /// <summary>
    /// Code-behind for a page that reads an uploaded text file of URLs (one
    /// per line), removes duplicates and orders the URLs by how often each
    /// one occurred, most frequent first.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Reads the uploaded file, ranks its distinct lines by occurrence
        /// count and writes the ranked list to the response.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            string[] ranked = RankByFrequency(GetStringList());

            // The ranked list used to be computed and then thrown away. No
            // output control is visible in this code-behind, so the result is
            // written straight to the response, HTML-encoded because the text
            // comes from an uploaded file.
            StringBuilder output = new StringBuilder();
            foreach (string line in ranked)
            {
                output.Append(HttpUtility.HtmlEncode(line)).Append("<br />");
            }

            Response.Write(output.ToString());
        }

        /// <summary>
        /// Returns the distinct values of <paramref name="items"/> ordered by
        /// descending occurrence count.
        /// </summary>
        /// <param name="items">The values to rank; may contain duplicates.</param>
        /// <returns>
        /// One entry per distinct value. Values with equal counts keep the
        /// order in which they first appeared, because GroupBy yields groups
        /// in first-occurrence order and OrderByDescending is a stable sort.
        /// </returns>
        private static string[] RankByFrequency(string[] items)
        {
            return items
                .GroupBy(item => item)
                .OrderByDescending(group => group.Count())
                .Select(group => group.Key)
                .ToArray();
        }

        /// <summary>
        /// Reads the file posted through <c>File1</c> and returns its
        /// non-blank lines with surrounding whitespace removed.
        /// </summary>
        /// <returns>
        /// The lines of the uploaded file, or an empty array when no file
        /// was posted.
        /// </returns>
        private string[] GetStringList()
        {
            var postedFile = this.File1.PostedFile;
            if (postedFile == null)
            {
                // Nothing was uploaded with this request.
                return new string[0];
            }

            string content;
            // NOTE(review): Encoding.Default is the machine's ANSI code page;
            // confirm that uploads are expected in that encoding.
            using (StreamReader reader = new StreamReader(postedFile.InputStream, Encoding.Default))
            {
                content = reader.ReadToEnd();
            }

            // Split on either line-break character and drop the empty pieces
            // that "\r\n" pairs produce. Trimming keeps stray trailing spaces
            // from making otherwise identical URLs look different.
            return content
                .Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                .Select(line => line.Trim())
                .Where(line => line.Length > 0)
                .ToArray();
        }
    }
}