这几天发现一个很好的图片网站,美女特多
随便在网上查了查,就下载了一个名叫 WebSpider 的蜘蛛程序。我仔细研究了一下,感觉作者也只是写着玩的,意思意思:网页下载下来基本就丢掉了。另外我对它的结构也不太满意,所以动手改了改。
我大致的想法是采用双线程:一个 UI 线程,一个工作线程。
/// <summary>
/// Fetches the page referenced by <paramref name="state"/>, downloads every image
/// source found in it, and then recursively processes the links found on the page.
/// </summary>
/// <param name="state">
/// Page to process. Its status, content and progress (<c>mes</c>) members are
/// filled in while the page is handled.
/// </param>
/// <returns>
/// The final value of <c>state.ProcessSuccessfull</c>: <c>true</c> when the page was
/// fetched and scanned without an exception, otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// <c>level</c> is incremented before every recursive call and is never decremented in
/// this method, so the <c>level &gt; 10</c> check caps the total number of followed
/// links rather than the nesting depth.
/// </remarks>
public bool Process( WebPageState state )
{
    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    // Remember the URI of the page seen while level is 1 (presumably the start page).
    if (level == 1)
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            HttpWebResponse httpRes = res as HttpWebResponse;
            if (httpRes != null)
            {
                state.StatusCode = httpRes.StatusCode.ToString( );
                state.StatusDescription = httpRes.StatusDescription;
            }
            if (res is FileWebResponse)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if (state.StatusCode.Equals( "OK" ))
            {
                // Dispose the reader (and the response stream it wraps) as soon as the
                // content is in memory, so it is not held open during the recursive crawl.
                using (StreamReader sr = new StreamReader( res.GetResponseStream( ) ))
                {
                    state.Content = sr.ReadToEnd( );
                }

                // Pass 1: download every image source that has not been seen yet.
                MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
                for (int k = 0; k < m.Count; k++)
                {
                    string Address = m[k].Groups[1].ToString();

                    // A malformed src must not abort the scan of the whole page.
                    Uri uri;
                    if (!Uri.TryCreate(state.Uri, m[k].Groups["url"].ToString(), out uri))
                    {
                        continue;
                    }
                    if (m_pages.Contains(uri.AbsoluteUri))
                    {
                        continue;
                    }

                    m_pages.Add(uri.AbsoluteUri);
                    DownloadImage(state.Uri, Address);

                    // Report progress to the subscriber, if there is one.
                    if (this.ContentHandler != null)
                    {
                        state.mes.MaxProgress = m.Count;
                        state.mes.Progress = k + 1;
                        state.mes.Result = state.Uri.AbsoluteUri;
                        state.mes.Status = TaskStatus.Running;
                        state.mes.Message = "当前共有图片下载数"+m.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
                        ContentHandler.Invoke(state);
                    }
                }

                // Pass 2: follow every valid link that has not been visited yet.
                for (Match mm = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
                     mm.Success;
                     mm = mm.NextMatch())
                {
                    // Skip malformed hrefs instead of abandoning the remaining links.
                    Uri uri;
                    if (!Uri.TryCreate(state.Uri, mm.Groups["url"].ToString(), out uri))
                    {
                        continue;
                    }
                    if (!ValidPage(uri) || m_pages.Contains(uri.AbsoluteUri))
                    {
                        continue;
                    }

                    // Limit reached: stop following links, but still fall through so this
                    // page is marked as processed and the result is logged below.
                    if (level > 10)
                    {
                        break;
                    }

                    level++;
                    m_pages.Add(uri.AbsoluteUri);
                    Process(new WebPageState(uri));
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch (Exception ex)
        {
            HandleException( ex, state );
        }
        finally
        {
            if (res != null)
            {
                res.Close( );
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine( ex.ToString( ) );
    }

    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}
#endregion

/// <summary>
/// Resolves <paramref name="imgUri"/> against <paramref name="m_bb"/> and saves the
/// target as <c>temp\img{n}.{ext}</c> when its extension is jpg, jpeg or swf.
/// </summary>
/// <param name="m_bb">URI of the page the image reference was found on.</param>
/// <param name="imgUri">Image address as it appeared in the page; may be relative.</param>
/// <remarks>
/// Best effort: any failure is logged to the console and does not propagate.
/// </remarks>
private void DownloadImage(Uri m_bb,string imgUri)
{
    try
    {
        Uri imageUri = new Uri(m_bb, imgUri);

        // presumably RightLastIndexOf returns the text after the last "." — verify against StrUtil
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // Compare exactly: a substring test against "jpg|jpeg|swf" would also accept
        // "", "j", "pg", "g|j" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // Take a file id only when something is actually downloaded.
        // NOTE(review): assumes the "temp" directory already exists — TODO confirm.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Report the failure instead of dropping it silently.
        Console.WriteLine("could not download img: {0} ({1})", imgUri, ex.Message);
    }
}
现在基本可以下载图片了,不过感觉需要优化的地方还比较多:递归的层级暂时没有控制,性能也一般,代码结构还比较乱,后续再重构!
/// <summary>
/// Fetches the page referenced by <paramref name="state"/>, downloads every image
/// source found in it, and then recursively processes the links found on the page.
/// </summary>
/// <param name="state">
/// Page to process. Its status, content and progress (<c>mes</c>) members are
/// filled in while the page is handled.
/// </param>
/// <returns>
/// The final value of <c>state.ProcessSuccessfull</c>: <c>true</c> when the page was
/// fetched and scanned without an exception, otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// <c>level</c> is incremented before every recursive call and is never decremented in
/// this method, so the <c>level &gt; 10</c> check caps the total number of followed
/// links rather than the nesting depth.
/// </remarks>
public bool Process( WebPageState state )
{
    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    // Remember the URI of the page seen while level is 1 (presumably the start page).
    if (level == 1)
    {
        m_baseUri = state.Uri;
    }

    try
    {
        Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );

        WebRequest req = WebRequest.Create( state.Uri );
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            HttpWebResponse httpRes = res as HttpWebResponse;
            if (httpRes != null)
            {
                state.StatusCode = httpRes.StatusCode.ToString( );
                state.StatusDescription = httpRes.StatusDescription;
            }
            if (res is FileWebResponse)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if (state.StatusCode.Equals( "OK" ))
            {
                // Dispose the reader (and the response stream it wraps) as soon as the
                // content is in memory, so it is not held open during the recursive crawl.
                using (StreamReader sr = new StreamReader( res.GetResponseStream( ) ))
                {
                    state.Content = sr.ReadToEnd( );
                }

                // Pass 1: download every image source that has not been seen yet.
                MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
                for (int k = 0; k < m.Count; k++)
                {
                    string Address = m[k].Groups[1].ToString();

                    // A malformed src must not abort the scan of the whole page.
                    Uri uri;
                    if (!Uri.TryCreate(state.Uri, m[k].Groups["url"].ToString(), out uri))
                    {
                        continue;
                    }
                    if (m_pages.Contains(uri.AbsoluteUri))
                    {
                        continue;
                    }

                    m_pages.Add(uri.AbsoluteUri);
                    DownloadImage(state.Uri, Address);

                    // Report progress to the subscriber, if there is one.
                    if (this.ContentHandler != null)
                    {
                        state.mes.MaxProgress = m.Count;
                        state.mes.Progress = k + 1;
                        state.mes.Result = state.Uri.AbsoluteUri;
                        state.mes.Status = TaskStatus.Running;
                        state.mes.Message = "当前共有图片下载数"+m.Count+" 现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
                        ContentHandler.Invoke(state);
                    }
                }

                // Pass 2: follow every valid link that has not been visited yet.
                for (Match mm = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
                     mm.Success;
                     mm = mm.NextMatch())
                {
                    // Skip malformed hrefs instead of abandoning the remaining links.
                    Uri uri;
                    if (!Uri.TryCreate(state.Uri, mm.Groups["url"].ToString(), out uri))
                    {
                        continue;
                    }
                    if (!ValidPage(uri) || m_pages.Contains(uri.AbsoluteUri))
                    {
                        continue;
                    }

                    // Limit reached: stop following links, but still fall through so this
                    // page is marked as processed and the result is logged below.
                    if (level > 10)
                    {
                        break;
                    }

                    level++;
                    m_pages.Add(uri.AbsoluteUri);
                    Process(new WebPageState(uri));
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch (Exception ex)
        {
            HandleException( ex, state );
        }
        finally
        {
            if (res != null)
            {
                res.Close( );
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine( ex.ToString( ) );
    }

    Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );

    return state.ProcessSuccessfull;
}
#endregion

/// <summary>
/// Resolves <paramref name="imgUri"/> against <paramref name="m_bb"/> and saves the
/// target as <c>temp\img{n}.{ext}</c> when its extension is jpg, jpeg or swf.
/// </summary>
/// <param name="m_bb">URI of the page the image reference was found on.</param>
/// <param name="imgUri">Image address as it appeared in the page; may be relative.</param>
/// <remarks>
/// Best effort: any failure is logged to the console and does not propagate.
/// </remarks>
private void DownloadImage(Uri m_bb,string imgUri)
{
    try
    {
        Uri imageUri = new Uri(m_bb, imgUri);

        // presumably RightLastIndexOf returns the text after the last "." — verify against StrUtil
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // Compare exactly: a substring test against "jpg|jpeg|swf" would also accept
        // "", "j", "pg", "g|j" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // Take a file id only when something is actually downloaded.
        // NOTE(review): assumes the "temp" directory already exists — TODO confirm.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Report the failure instead of dropping it silently.
        Console.WriteLine("could not download img: {0} ({1})", imgUri, ex.Message);
    }
}
