zoukankan      html  css  js  c++  java
  • .net网站数据抓取

    最近的项目需要抓取人民币汇率中间价的数据,所以写了一个简单的爬虫来完成抓取。抓取的网站为:http://www.safe.gov.cn/wps/portal/sy/tjsj_hlzjj_inquire

    #region Capture data (RMB central parity rate)
    /// <summary>
    /// Downloads the RMB central parity rate table from www.safe.gov.cn and inserts
    /// every row whose date is not yet present in <c>RMB_EXCHANGERATE</c>.
    /// </summary>
    /// <remarks>
    /// When <c>executeType</c> equals <c>"true"</c> only today's rates are requested;
    /// otherwise the range <c>startTime</c>..<c>endTime</c> is requested.
    /// Progress and failures are reported through <c>SetLogging</c>; failures inside an
    /// attempt are logged rather than rethrown. An attempt that fails with a timeout is
    /// repeated until it no longer times out (same policy as the previous recursive
    /// version, but without growing the call stack).
    /// <c>isExecuting</c> is true for the whole duration of the call and is always
    /// reset on exit.
    /// </remarks>
    public void CaptureData()
    {
        isExecuting = true;
        try
        {
            while (!TryCaptureOnce())
            {
                // The attempt timed out; run it again.
            }
        }
        finally
        {
            // Restore the flag whether the capture succeeded or failed.
            isExecuting = false;
        }
    }

    /// <summary>
    /// Runs a single download / parse / store attempt and logs its outcome.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the attempt is finished (succeeded, returned an empty page, or
    /// failed for a reason other than a timeout); <c>false</c> when it timed out and
    /// should be repeated.
    /// </returns>
    private bool TryCaptureOnce()
    {
        // NOTE(review): the line breaks inside the log messages are reconstructed as "\r\n";
        // the listing this was taken from showed them as raw line breaks.
        StringBuilder startLog = new StringBuilder();
        startLog.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
        startLog.Append("开始抓取人民币汇率中间价...\r\n\r\n");
        SetLogging(startLog.ToString());

        // Read the clock once so the log title and the request URL always name the same day.
        string today = DateTime.Now.ToString("yyyy-MM-dd");
        bool todayOnly = executeType == "true";

        string title = todayOnly
            ? today + "的数据抓取"
            : "时间范围为:开始时间为" + startTime + ",结束时间为" + endTime + "的数据抓取";

        try
        {
            const string basePath = "http://www.safe.gov.cn/AppStructured/view/project_RMBQuery.action";
            string url = todayOnly
                ? basePath + "?projectBean.startDate=" + today + "&projectBean.endDate=" + today + "&queryYN=true"
                : basePath + "?projectBean.startDate=" + startTime + "&projectBean.endDate=" + endTime + "&queryYN=true";

            string html = DownloadRatePage(url);

            StringBuilder resultLog = new StringBuilder();
            resultLog.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
            if (string.IsNullOrEmpty(html))
            {
                resultLog.AppendFormat("{0}为空\r\n\r\n\r\n", title);
            }
            else
            {
                // The database is only touched once the page has been downloaded and parsed,
                // so no transaction is held open while waiting on the network.
                SaveRates(ExtractRateRows(html));
                resultLog.AppendFormat("{0}成功\r\n\r\n", title);
            }

            SetLogging(resultLog.ToString());
            return true;
        }
        catch (Exception ex)
        {
            SetLogging(logTemplate2(ex, title + "失败"));
            return !IsTimeoutFailure(ex);
        }
    }

    /// <summary>
    /// Tells whether <paramref name="ex"/> represents a request timeout.
    /// </summary>
    /// <param name="ex">The exception caught during a capture attempt.</param>
    /// <returns><c>true</c> for a timed-out web request or the legacy "请求超时" message.</returns>
    private static bool IsTimeoutFailure(Exception ex)
    {
        // WebExceptionStatus does not depend on the UI language of the runtime; the message
        // comparison is kept so anything the previous check matched is still retried.
        WebException webEx = ex as WebException;
        return (webEx != null && webEx.Status == WebExceptionStatus.Timeout)
            || ex.Message == "请求超时";
    }

    /// <summary>
    /// Issues the GET request for the rate query page and returns the response body.
    /// </summary>
    /// <param name="url">Fully built query URL.</param>
    /// <returns>The response body decoded as UTF-8.</returns>
    /// <exception cref="WebException">The request failed or timed out.</exception>
    private static string DownloadRatePage(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.ProtocolVersion = HttpVersion.Version11;
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.Timeout = 100000;
        // Replaces the hand-written "Accept-Encoding: gzip, deflate" header: with this property
        // the request still advertises gzip/deflate, and the response stream is actually
        // decompressed before it is read as text.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
        request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
        request.CookieContainer = new CookieContainer();

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the cell texts of every <c>&lt;tr class="first"&gt;</c> row found inside
    /// the <c>InfoTable</c> table of the page.
    /// </summary>
    /// <param name="html">Raw page markup.</param>
    /// <returns>One string array per matched row, holding the inner text of each <c>td</c>.</returns>
    private static List<string[]> ExtractRateRows(string html)
    {
        // Flatten the markup onto one line so the patterns below do not need to match
        // across line breaks.
        // NOTE(review): the literals are reconstructed from a listing whose escape sequences
        // had been expanded; "\r", "\n", "\t" and a backslash are assumed - confirm.
        string flat = html.Replace("\r", "")
                          .Replace("\n", "")
                          .Replace("\t", "")
                          .Replace("\\", "");
        flat = Regex.Replace(flat, @"<!--(?s).*?-->", "", RegexOptions.IgnoreCase);   // strip HTML comments
        flat = flat.Replace("&nbsp;", "");

        string tableHtml = Regex.Match(flat, "<table class=\"list\" id=\"InfoTable\".*>.*</table>").Value;

        // Built once instead of once per row.
        Regex cellPattern = new Regex("<td.*?>(.*?)</td>");

        List<string[]> rows = new List<string[]>();
        foreach (Match row in Regex.Matches(tableHtml, "<tr class=\"first\".*?>(.*?)</tr>"))
        {
            rows.Add(cellPattern.Matches(row.Value)
                                .Cast<Match>()
                                .Select(cell => cell.Groups[1].Value)
                                .ToArray());
        }

        return rows;
    }

    /// <summary>
    /// Inserts one <c>RMB_EXCHANGERATE</c> record per row whose date is not stored yet,
    /// all inside a single transaction.
    /// </summary>
    /// <param name="rows">Rows as returned by <see cref="ExtractRateRows"/>; cell 0 is the
    /// date, cells 1..23 are the rates in the column order mapped below.</param>
    /// <exception cref="FormatException">A row has fewer than 24 cells or a cell cannot be parsed.</exception>
    private static void SaveRates(List<string[]> rows)
    {
        const int expectedCells = 24;   // date + 23 currency columns
        System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

        // Disposing an uncommitted transaction discards its work, so every failure path
        // leaves the table untouched without an explicit Rollback call.
        using (var db = new dbContext())
        using (var trans = db.Database.BeginTransaction())
        {
            foreach (string[] cells in rows)
            {
                if (cells.Length < expectedCells)
                {
                    throw new FormatException(
                        "Rate row has " + cells.Length + " cells; expected at least " + expectedCells + ".");
                }

                DateTime rateDate = Convert.ToDateTime(cells[0], invariant);
                if (db.RMB_EXCHANGERATE.Any(p => p.TIME == rateDate))
                {
                    continue;   // this date is already stored
                }

                string[] row = cells;
                Func<int, decimal> rate = index => Convert.ToDecimal(row[index], invariant);

                db.RMB_EXCHANGERATE.Add(new RMB_EXCHANGERATE
                {
                    ID = Guid.NewGuid().ToString(),
                    TIME = rateDate,
                    USD = rate(1),
                    EUR = rate(2),
                    JPY = rate(3),
                    HKD = rate(4),
                    GBP = rate(5),
                    MYR = rate(6),
                    SUR = rate(7),
                    ZAR = rate(8),
                    KRW = rate(9),
                    AED = rate(10),
                    SR = rate(11),
                    HUF = rate(12),
                    PLN = rate(13),
                    DKK = rate(14),
                    SEK = rate(15),
                    NOK = rate(16),
                    ITL = rate(17),
                    PHP = rate(18),
                    AUD = rate(19),
                    CAD = rate(20),
                    NZD = rate(21),
                    SGD = rate(22),
                    CHF = rate(23),
                    CREATETIME = DateTime.Now
                });
            }

            db.SaveChanges();
            trans.Commit();
        }
    }
    #endregion

     

     

  • 相关阅读:
    struts2简介
    项目整合SpringDataRedis
    SpringDataRedis入门Demo
    包管理-rpm
    文件查找与压缩
    N042第一周
    Shell
    Linux下终端字体颜色设置方法
    文本处理工具作业
    正则表达式
  • 原文地址:https://www.cnblogs.com/kehaocheng/p/7503812.html
Copyright © 2011-2022 走看看