zoukankan      html  css  js  c++  java
  • 点滴积累【C#】---抓取页面中想要的数据

    效果

    描述:此功能用于抓取一个国外监测 PM2.5 的网站:读取网站上的实时数据并保存到数据库中,每隔一小时刷新一次(下面的程序每次运行只抓取一次,需要配合计划任务等方式每小时执行一次)。

    地址为:http://beijing.usembassy-china.org.cn/070109air.html

    筛选后的地址为:http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&

    思路:先抓取页面的全部内容并保存到 txt 文件,再一行一行地读取该 txt,用 Split、Substring 截取出自己想要的数据,最后保存到数据库;插入数据库之前先检查该条数据是否已经存在,不存在才插入。

    代码

    using System;
    using System.Collections.Generic;
    using System.Configuration;
    using System.Data;
    using System.Data.SqlClient;
    using System.IO;
    //using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    //using System.Threading.Tasks;
    
    /********************************
     * 创建人:青苹果
     * 创建时间:2015-12-28
     * 描述:获取美利坚合众国的 PM2.5
     * ******************************/
    
    namespace GetUSAData
    {
        class Program
        {
            // Address the feed is downloaded from. Alternative kept for reference: read it from
            // the "GetURL" appSettings key instead of hard-coding it.
            //public static string GetURL = ConfigurationManager.AppSettings["GetURL"];
            public static string GetURL = "http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&";

            // Path of the text file the downloaded page is saved to ("txtURL" appSettings key).
            // Read through ConfigurationManager: ConfigurationSettings.AppSettings is obsolete,
            // and ConfigurationManager is what the connection string below already uses.
            public static string txtURL = ConfigurationManager.AppSettings["txtURL"];

            // Value of the connection string registered under the name "ConnectionString".
            public static string conn = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();
    
            /// <summary>
            /// Program entry point: runs one download/parse/store pass and exits.
            /// </summary>
            /// <param name="args">Command-line arguments; not used.</param>
            static void Main(string[] args)
            {
                // All of the work lives in LoadGO; Main only triggers it once.
                Program.LoadGO();
            }
    
            /// <summary>
            /// Runs one full pass: downloads the feed to the text file, parses it, deletes the
            /// text file, and inserts every parsed reading that is not yet present in T_twitter.
            /// For each reading a daily average is derived from the sums already stored for the
            /// same calendar day.
            /// </summary>
            /// <remarks>
            /// Each parsed row is expected to carry at least five fields: [0] is parsed as the
            /// reading's date/time, [2] as a float, [3] as an integer and [4] is stored as text.
            /// Rows with fewer fields are skipped instead of raising IndexOutOfRangeException.
            /// </remarks>
            public static void LoadGO()
            {
                const string sqlSelect = @"SELECT count(*) as allcount,sum(LatestHourdata1) as LatestHourdata1_avg, sum(LatestHourdata2) as LatestHourdata2_avg
     FROM T_twitter  where ([LatestHourdata1] is not null
     or [LatestHourdata2] is not null or [Avgdata1] is not null
      or [AvgData2] is not null) and   CONVERT(varchar(100), [datetime], 23)=@strDatetime";

                const string sqlInsert = @"if not exists(select * from  dbo.T_twitter where  datetime=@whereDatetime) begin
    insert T_twitter (datetime,LatestHourdata1,LatestHourdata2,LatestHourdata3,Avgdata1,AvgData2,AvgData3)
    VALUES(@datetime,@LatestHourdata1,@LatestHourdata2,@LatestHourdata3,@Avgdata1,@Avgdata2,@Avgdata3) end";

                System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

                GetUSA();
                List<string[]> getlist = Read(txtURL);

                // The text file is only a hand-off between GetUSA and Read; remove it once parsed.
                if (File.Exists(txtURL))
                {
                    File.Delete(txtURL);
                }

                // Walk the parsed rows from the last one to the first one.
                for (int i = getlist.Count - 1; i > -1; i--)
                {
                    string[] fields = getlist[i];
                    if (fields.Length < 5)
                    {
                        // Read() keeps every line containing "title"; lines that are not
                        // readings do not have the five fields used below.
                        continue;
                    }

                    // NOTE(review): the date text is still parsed with the current culture,
                    // as before; confirm the feed's date format against the deployment culture.
                    DateTime dtime = DateTime.Parse(fields[0]);
                    string getTime = dtime.ToString("yyyy-MM-dd HH:mm", invariant);
                    string controlTime = dtime.ToString("yyyy-MM-dd", invariant);

                    // The feed is machine-readable text, so parse numbers culture-independently.
                    float latestHourData1 = float.Parse(fields[2], invariant);
                    int latestHourData2 = Convert.ToInt32(fields[3], invariant);

                    float avgData1 = 0;
                    int avgData2 = 0;
                    string avgData3 = fields[4];

                    List<SqlParameter> listWhere = new List<SqlParameter>
                    {
                        new SqlParameter("@strDatetime", controlTime)
                    };

                    // Sums of the values already stored for this day, used for the daily average.
                    DataTable sumDT = ControlDB(sqlSelect, listWhere, "select");
                    if (sumDT.Rows.Count > 0)
                    {
                        foreach (DataRow itemDR in sumDT.Rows)
                        {
                            int allcount = Convert.ToInt32(itemDR["allcount"]);
                            if (allcount > 0)
                            {
                                // A SQL NULL arrives as DBNull.Value, never as null, so the
                                // check has to go through IsNull. Otherwise a NULL SUM would
                                // be converted and throw.
                                if (!itemDR.IsNull("LatestHourdata1_avg"))
                                {
                                    float storedSum1 = Convert.ToSingle(itemDR["LatestHourdata1_avg"]);
                                    // (stored sum + newest value) / (stored count + 1) = daily average
                                    avgData1 = (storedSum1 + latestHourData1) / (allcount + 1);
                                }
                                if (!itemDR.IsNull("LatestHourdata2_avg"))
                                {
                                    int storedSum2 = Convert.ToInt32(itemDR["LatestHourdata2_avg"]);
                                    // Integer division: the fractional part is dropped.
                                    avgData2 = (storedSum2 + latestHourData2) / (allcount + 1);
                                }

                                avgData3 = DescribeDailyAverage(avgData2);
                            }
                            else
                            {
                                // First reading of the day: the average is the reading itself
                                // and the category text from the feed is kept.
                                avgData1 = latestHourData1;
                                avgData2 = latestHourData2;
                            }
                        }
                    }

                    List<SqlParameter> pars = new List<SqlParameter>
                    {
                        new SqlParameter("@whereDatetime", getTime),
                        new SqlParameter("@datetime", getTime),
                        new SqlParameter("@LatestHourdata1", latestHourData1),
                        new SqlParameter("@LatestHourdata2", latestHourData2),
                        new SqlParameter("@LatestHourdata3", fields[4]),
                        new SqlParameter("@Avgdata1", avgData1),
                        new SqlParameter("@Avgdata2", avgData2),
                        new SqlParameter("@Avgdata3", avgData3)
                    };

                    // The statement itself skips the insert when the timestamp already exists.
                    ControlDB(sqlInsert, pars, "");
                }
            }

            /// <summary>
            /// Maps a daily-average value to the category text stored in AvgData3.
            /// </summary>
            /// <param name="average">The daily average computed from LatestHourdata2 values.</param>
            /// <returns>
            /// The category text for the band containing <paramref name="average"/>. Anything
            /// outside 0-300, including negative values, yields the "Hazardous" text.
            /// </returns>
            private static string DescribeDailyAverage(int average)
            {
                if (average >= 0 && average <= 50)
                {
                    return " Good (at 24-hour exposure at this level)";
                }
                if (average >= 51 && average <= 100)
                {
                    return " Moderate (at 24-hour exposure at this level)";
                }
                if (average >= 101 && average <= 150)
                {
                    return " Unhealthy for Sensitive Groups (at 24-hour exposure at this level)";
                }
                if (average >= 151 && average <= 200)
                {
                    return " Unhealthy (at 24-hour exposure at this level)";
                }
                if (average >= 201 && average <= 300)
                {
                    return " Very Unhealthy (at 24-hour exposure at this level)";
                }
                return " Hazardous (at 24-hour exposure at this level)";
            }
    
            /// <summary>
            /// Downloads the page at <c>GetURL</c> and saves its text to the file at <c>txtURL</c>,
            /// replacing any existing file.
            /// </summary>
            /// <remarks>
            /// The response is decoded as gb2312 and re-encoded with <c>Encoding.Default</c>,
            /// which is also the encoding <c>Read</c> opens the file with.
            /// NOTE(review): the request URL carries "utf=y", so the response may actually be
            /// UTF-8; confirm before changing the decoder.
            /// </remarks>
            public static void GetUSA()
            {
                WebRequest request = WebRequest.Create(GetURL);

                // Dispose the response and reader even when the download fails part-way.
                string pageSource;
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
                {
                    // ReadToEnd returns the complete page source.
                    pageSource = reader.ReadToEnd();
                }

                // Write only after the whole page has been read, so a failed download
                // does not leave an empty or truncated file behind.
                byte[] data = System.Text.Encoding.Default.GetBytes(pageSource);
                File.WriteAllBytes(txtURL, data);
            }
    
            /// <summary>
            /// Reads the text file line by line and extracts the fields that follow each
            /// occurrence of "title".
            /// </summary>
            /// <param name="path">Path of the text file to read.</param>
            /// <returns>
            /// One string array per matching line: the text after "title" plus two more
            /// characters, cut at the first '"' and then at the first '&amp;', split on ';'
            /// with empty entries removed. Lines without "title", lines where "title" starts
            /// at column 0, and lines too short to hold a value after the marker contribute
            /// nothing.
            /// </returns>
            public static List<string[]> Read(string path)
            {
                // Length of the marker plus the two characters that follow it (e.g. title=").
                const int valueOffset = 7;

                List<string[]> list = new List<string[]>();

                // using guarantees the file handle is released even if parsing throws.
                using (StreamReader sr = new StreamReader(path, Encoding.Default))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        int i = line.IndexOf("title", StringComparison.Ordinal);

                        // i > 0 is kept from the original: a match at column 0 is ignored.
                        // The length check prevents Substring from throwing on short lines
                        // such as "<title>".
                        if (i <= 0 || i + valueOffset > line.Length)
                        {
                            continue;
                        }

                        string titleStr = line.Substring(i + valueOffset);   // text after the marker
                        string titledata = titleStr.Split('"')[0];           // up to the closing quote
                        string data = titledata.Split('&')[0];               // drop anything from the first '&'
                        string[] datastrlist = data.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
                        list.Add(datastrlist);
                    }
                }

                return list;
            }
    
            /// <summary>
            /// Runs a SQL statement through <c>DataAccess</c>, either as a query or as a command.
            /// </summary>
            /// <param name="sql">The SQL text to run.</param>
            /// <param name="par">Parameters referenced by <paramref name="sql"/>.</param>
            /// <param name="type">"select" to run a query; any other value runs a command.</param>
            /// <returns>
            /// For "select", whatever <c>DataAccess.GetDataTable</c> returns; otherwise a new,
            /// empty <see cref="DataTable"/>.
            /// </returns>
            public static DataTable ControlDB(string sql, List<SqlParameter> par, string type)
            {
                DataAccess db = new DataAccess();
                SqlParameter[] arguments = par.ToArray();

                if (type != "select")
                {
                    // Command path: the affected-row count is not reported to the caller.
                    db.ExecuteSql(sql, arguments);
                    return new DataTable();
                }

                return db.GetDataTable(sql, arguments);
            }
        }
    }

     Demo下载:

     http://files.cnblogs.com/files/xinchun/GetUSAData.zip

  • 相关阅读:
    IDS与IPS功能分析
    CentOS 命令大全
    仿京东放大镜
    CSS垂直水平居中方法总结
    Java学习之计算机基础(一)
    Java 代码学习之理解数据类型中的坑
    Java 代码学习之数组的初始化
    Java中的比较总结
    Java Random介绍
    手机网站开发必修课[2]:浏览器兼容性测试
  • 原文地址:https://www.cnblogs.com/xinchun/p/5090314.html
Copyright © 2011-2022 走看看