zoukankan html css js c++ java

一个基于Android系统的新闻客户端(二)

　　　　　　三、抓取消息标题

　　　　　　　　完整的网络爬虫非常复杂，本文无法实现，所以这里只演示如何抓取单个网页的 HTML 代码。

　　　　　　　　C#中有一个类叫WebClient,这个类的成员方法可以下载指定网页的html代码，用法为：

　　　　　　　　// Download the page's HTML line by line and concatenate the lines into str.
　　　　　　　　WebClient wl=new WebClient();

　　　　　　　　Stream sm=wl.OpenRead("http://xxxxxx");

　　　　　　　　StreamReader sr=new StreamReader(sm);

　　　　　　　　string str=String.Empty;

　　　　　　　　string ch=String.Empty;

　　　　　　　　// ReadLine belongs to StreamReader (not Stream) and returns null at end of stream.
　　　　　　　　while((ch=sr.ReadLine())!=null)

　　　　　　　　{

　　　　　　　　　　　str +=ch;

　　　　　　　　}

　　　　　　　　新建一个类库项目：Crawler。

　　　　　　　　添加类CrawlerMain。

　　　　　　　　代码为：

using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
    /// <summary>
    /// Downloads the HTML of the page whose address is stored in the "Ardess"
    /// connection-string entry of the application configuration, and caches
    /// the downloaded text for later calls to <see cref="GetNew"/>.
    /// </summary>
    public class CrawlerMain
    {
        private readonly WebClient wc;

        // Cached page text; empty until the first successful download via GetNew.
        private string New;

        /// <summary>
        /// Creates the crawler with a fresh <see cref="WebClient"/> and an empty cache.
        /// </summary>
        public CrawlerMain()
        {
            wc = new WebClient();
            New = String.Empty;
        }

        /// <summary>
        /// Downloads the configured page and returns its text with all lines
        /// concatenated (line terminators are dropped by <c>ReadLine</c>).
        /// </summary>
        /// <returns>The page content as a single string.</returns>
        /// <exception cref="InvalidOperationException">
        /// The "Ardess" connection-string entry is missing from the configuration.
        /// </exception>
        public string Main_ReadWeb()
        {
            ConnectionStringSettings setting = ConfigurationManager.ConnectionStrings["Ardess"];
            if (setting == null)
            {
                throw new InvalidOperationException(
                    "The connection string \"Ardess\" (the page address) is not configured.");
            }

            // Dispose the response stream and reader even if reading fails.
            using (Stream sm = wc.OpenRead(setting.ToString()))
            using (StreamReader sr = new StreamReader(sm))
            {
                // StringBuilder avoids re-copying the whole page on every appended line.
                StringBuilder page = new StringBuilder();
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    page.Append(line);
                }
                return page.ToString();
            }
        }

        /// <summary>
        /// Returns the cached page text, downloading it first when the cache is empty.
        /// </summary>
        /// <returns>The page content as a single string.</returns>
        public string GetNew()
        {
            if (String.IsNullOrEmpty(New))
            {
                // Read synchronously: this method must return the text itself, so a
                // fire-and-forget download would hand back the still-empty cache.
                New = this.Main_ReadWeb();
            }
            return New;
        }
    }
}

View Code

　　　　　　　　新建一个控制台程序

　　　　　　　　　代码为：　　　　　

using Crawler;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Test
{
    /// <summary>
    /// Console test driver: downloads the page through <see cref="CrawlerMain"/>,
    /// cuts out the text between the last "box_01" and the last "box_02" markers,
    /// echoes the fragments to the console and appends selected quoted values to t.txt.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string str = new CrawlerMain().Main_ReadWeb();

            int start = str.LastIndexOf("box_01", StringComparison.Ordinal);
            int end = str.LastIndexOf("box_02", StringComparison.Ordinal);

            if (start < 0 || end < start)
            {
                // Without both markers in order, Substring would throw; report instead.
                Console.Error.WriteLine("Markers \"box_01\"/\"box_02\" were not found in the page.");
            }
            else
            {
                string str_1 = str.Substring(start, end - start);

                // Splitting on 'a' also cuts "_blank" into "_bl" + "nk", which is
                // what the "_bl" comparison below relies on.
                string[] strNew = str_1.Split(new char[] { 'a' });

                // Set after a "_bl" value is seen: the next quoted value gets saved.
                bool j = false;
                foreach (string s in strNew)
                {
                    Console.Write(s + "\n");

                    string[] ss = s.Split('=');
                    if (ss.Length <= 1)
                    {
                        continue;
                    }

                    string[] sss = ss[1].Split('"');
                    if (sss.Length <= 1)
                    {
                        // No double-quoted value after '=' — nothing to inspect.
                        continue;
                    }

                    string value = sss[1];
                    if (j)
                    {
                        File.AppendAllText("t.txt", value + "\n");
                        j = false;
                    }
                    if (value == "_bl")
                    {
                        j = true;
                    }
                }
            }

            // Keep the console window open until a key is pressed.
            Console.Read();
        }
    }
}

View Code

　　　　　　　　好了就这样吧！

查看全文

相关阅读:
PRCT-1302 the OCR has an invalid ip address
函数listen
函数bind
函数socket
lamp。查看版本
 yii 日期插件
 UCenter 的目录结构
 API接口
 返回标签数据示例 (PHP)
应用接口函数

原文地址：https://www.cnblogs.com/liguifa/p/3801508.html