以下是我写的一个从网页中抓取 Email 的方法，可以处理带分页的链接。用这个程序，我一下子从一个网页中提取到了 3000 多个 Email（哈哈，发垃圾邮件的人是不是也是这样做的？）
1
// Call
2
/// <summary>
/// Starts e-mail extraction for the given page on a new thread.
/// The link-handling part of this method is elided ("...") in this listing.
/// </summary>
/// <param name="urlStr">Address of the page; passed to GetEmailAddress as the thread argument.</param>
private void GetAllURL(string urlStr)
3
{
4
// Run GetEmailAddress on its own thread, handing it urlStr as the object parameter.
new Thread(new ParameterizedThreadStart(GetEmailAddress)).Start(urlStr);
... // process the links found in the page
}
5
/// <summary>
/// Extracts the e-mail addresses found in a web page and passes each one, followed by
/// a CRLF, to <c>AppendText</c> through <c>Invoke</c>.
/// </summary>
/// <param name="urlStr">
/// Address of the page to scan. Must be a <see cref="string"/>; it is typed as
/// <see cref="object"/> so the method can be used with <c>ParameterizedThreadStart</c>.
/// </param>
private void GetEmailAddress(object urlStr)
{
    ArrayList emailStrs;
    try
    {
        emailStrs = GetWebInfo((string)urlStr, @"(?<EmailStr>\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)");
    }
    catch (WebException ex)
    {
        // This method is started as a thread entry point; an exception escaping it
        // would be unhandled on that thread. A page that cannot be downloaded is an
        // expected condition, so record it and skip the page instead.
        System.Diagnostics.Trace.WriteLine("GetEmailAddress: failed to download " + urlStr + ": " + ex.Message);
        return;
    }
    catch (UriFormatException ex)
    {
        // Same reasoning for a malformed address.
        System.Diagnostics.Trace.WriteLine("GetEmailAddress: invalid URL " + urlStr + ": " + ex.Message);
        return;
    }

    foreach (object email in emailStrs)
    {
        // One Invoke per address, each terminated by CRLF.
        Invoke(new AppendTextDelegate(AppendText), new object[] { email + "\r\n" });
    }
}
17
18
/// <summary>
/// Downloads the page at <paramref name="URlStr"/> with an HTTP GET and returns, for every
/// match of <paramref name="RegExpress"/> in the page text, the value captured by the
/// pattern's first named group.
/// </summary>
/// <param name="URlStr">Absolute URL of the page to download.</param>
/// <param name="RegExpress">
/// Regular expression, applied case-insensitively. If it contains a named group
/// <c>(?&lt;name&gt;...)</c>, the capture of the first such group is returned for each match;
/// if it contains no usable named group, the whole match is returned instead.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of trimmed strings, one per match, in the order the matches
/// occur in the page. Empty when nothing matches.
/// </returns>
private ArrayList GetWebInfo(string URlStr, string RegExpress)
{
    // Open the requested page.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(URlStr));
    request.Method = "GET";

    // Dispose the response and reader so the underlying connection is released;
    // leaving them open exhausts the per-host connection pool when many pages are fetched.
    string textData;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
    {
        // NOTE(review): Encoding.Default ignores the charset declared by the server;
        // kept as in the original so existing results do not change.
        textData = reader.ReadToEnd();
    }

    Regex regex = new Regex(RegExpress, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Find the name of the first "(?<name>" group in the pattern text. The name is only
    // accepted if the regex really defines such a group, so patterns without a named
    // group (or with a lookbehind "(?<=...)") fall back to the whole match.
    string groupName = null;
    int open = RegExpress.IndexOf("(?<", StringComparison.Ordinal);
    if (open >= 0)
    {
        int close = RegExpress.IndexOf('>', open);
        if (close > open + 3)
        {
            string candidate = RegExpress.Substring(open + 3, close - open - 3);
            if (regex.GroupNumberFromName(candidate) >= 0)
            {
                groupName = candidate;
            }
        }
    }

    ArrayList result = new ArrayList();
    for (Match m = regex.Match(textData); m.Success; m = m.NextMatch())
    {
        string value = groupName != null ? m.Groups[groupName].Value : m.Value;
        result.Add(value.Trim());
    }
    return result;
}
44

2

3

4

... //处理页面中的Link
}
5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

上述代码中的关键是书写提取EMail的表达式:
@"(?<EmailStr>\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)"
以下是我写的一个程序界面及运行结果: