一个基于Android系统的新闻客户端(二)
三、抓取消息标题
完整的网络爬虫非常复杂,这里不做实现,只演示如何抓取单个网页的内容。
C#中有一个类叫WebClient,这个类的成员方法可以下载指定网页的html代码,用法为:
// Download the HTML of a page and collect it into a single string.
WebClient wl=new WebClient();
Stream sm=wl.OpenRead("http://xxxxxx");
StreamReader sr=new StreamReader(sm);
string str=String.Empty;
string ch=String.Empty;
// ReadLine lives on the StreamReader (not on the raw Stream) and returns
// null at end of input; the line terminators themselves are not kept.
while((ch=sr.ReadLine())!=null)
{
str +=ch;
}
新建一个类库项目:Crawler。
添加类CrawlerMain。
代码为:
using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
    /// <summary>
    /// Downloads the HTML of the page whose address is stored in the
    /// "Ardess" entry of the application's connection-strings configuration.
    /// </summary>
    public class CrawlerMain
    {
        private readonly WebClient wc;

        // Cached page text; empty until the first successful download via GetNew.
        private string New;

        /// <summary>
        /// Creates a crawler with its own <see cref="WebClient"/> and an empty cache.
        /// </summary>
        public CrawlerMain()
        {
            wc = new WebClient();
            New = String.Empty;
        }

        /// <summary>
        /// Downloads the configured page and returns its text with all lines
        /// joined together (line terminators are dropped, as ReadLine strips them).
        /// </summary>
        /// <returns>The page content as one string without line breaks.</returns>
        /// <exception cref="ConfigurationErrorsException">
        /// The "Ardess" connection-string entry is missing from the configuration.
        /// </exception>
        public string Main_ReadWeb()
        {
            ConnectionStringSettings setting = ConfigurationManager.ConnectionStrings["Ardess"];
            if (setting == null)
            {
                // Fail with a descriptive error instead of a bare NullReferenceException.
                throw new ConfigurationErrorsException(
                    "Connection string 'Ardess' is not defined in the application configuration.");
            }

            StringBuilder page = new StringBuilder();

            // Dispose the network stream and reader even if reading fails part-way.
            using (Stream sm = wc.OpenRead(setting.ToString()))
            using (StreamReader sr = new StreamReader(sm))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    page.Append(line);
                }
            }

            return page.ToString();
        }

        /// <summary>
        /// Returns the page text, downloading it on first use and reusing the
        /// cached copy afterwards.
        /// </summary>
        /// <remarks>
        /// The download runs synchronously. The previous implementation started
        /// an <c>async void</c> download and returned immediately, so the first
        /// call handed back an empty string and download errors were unobservable.
        /// </remarks>
        /// <returns>The downloaded page content.</returns>
        public string GetNew()
        {
            if (String.IsNullOrEmpty(New))
            {
                New = Main_ReadWeb();
            }

            return New;
        }
    }
}
新建一个控制台程序
代码为:
using Crawler;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Test
{
    /// <summary>
    /// Console driver: downloads the page through <see cref="CrawlerMain"/>,
    /// cuts out the text between the last "box_01" and the last "box_02"
    /// markers, and appends selected quoted values to "t.txt".
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string html = new CrawlerMain().Main_ReadWeb();

            int start = html.LastIndexOf("box_01", StringComparison.Ordinal);
            int end = html.LastIndexOf("box_02", StringComparison.Ordinal);

            if (start < 0 || end < start)
            {
                // Substring would throw ArgumentOutOfRangeException here;
                // report the problem instead of crashing.
                Console.Error.WriteLine("Markers \"box_01\"/\"box_02\" were not found in the expected order.");
            }
            else
            {
                WriteQuotedValues(html.Substring(start, end - start));
            }

            // Keep the console window open until a key is pressed.
            Console.Read();
        }

        /// <summary>
        /// Splits <paramref name="section"/> on the character 'a', echoes every
        /// fragment to the console, and for each fragment containing '=' looks
        /// at the text following the first double quote after the first '='.
        /// When the previous such value was "_bl", the current value is
        /// appended to "t.txt".
        /// </summary>
        /// <param name="section">The slice of page text to scan.</param>
        private static void WriteQuotedValues(string section)
        {
            string[] fragments = section.Split(new char[] { 'a' });

            // True when the previous quoted value was "_bl".
            // NOTE(review): "_bl" is presumably what is left of target="_blank"
            // after splitting on 'a' - confirm against the page markup.
            bool pending = false;

            foreach (string fragment in fragments)
            {
                Console.Write(fragment + " ");

                string[] byEquals = fragment.Split('=');
                if (byEquals.Length <= 1)
                {
                    continue;
                }

                string[] byQuote = byEquals[1].Split('"');
                if (byQuote.Length <= 1)
                {
                    // No double quote after '=': there is no quoted value to
                    // read, and indexing [1] would throw.
                    continue;
                }

                string value = byQuote[1];

                if (pending)
                {
                    File.AppendAllText("t.txt", value + " ");
                    pending = false;
                }

                if (value == "_bl")
                {
                    pending = true;
                }
            }
        }
    }
}
好了就这样吧!