zoukankan      html  css  js  c++  java
  • .net网站数据抓取

    最新项目需要抓取人民币汇率中间价的数据,所以就写了个简单的爬虫抓取数据。抓取的网站为:http://www.safe.gov.cn/wps/portal/sy/tjsj_hlzjj_inquire

    #region Capture data (RMB central parity exchange rates)
    /// <summary>
    /// Maximum number of extra attempts made after a request timeout before giving up.
    /// </summary>
    private const int MaxTimeoutRetries = 3;

    /// <summary>
    /// Downloads the RMB central parity rate table from safe.gov.cn and stores every
    /// row whose date is not yet present in <c>RMB_EXCHANGERATE</c>.
    /// When <c>executeType</c> is "true" only today's data is requested; otherwise the
    /// range <c>startTime</c>..<c>endTime</c> is requested.
    /// Failures are logged through <c>SetLogging</c> and not rethrown. A timed-out
    /// request is retried up to <see cref="MaxTimeoutRetries"/> times.
    /// <c>isExecuting</c> is true for the whole duration of the call and is always
    /// reset to false on exit.
    /// </summary>
    public void CaptureData()
    {
        isExecuting = true;
        try
        {
            // Retry in a loop (not by recursion) so that each attempt gets a fresh
            // context/transaction and the call stack cannot grow without bound.
            for (int attempt = 0; attempt <= MaxTimeoutRetries; attempt++)
            {
                if (!CaptureDataOnce())
                {
                    break;
                }
            }
        }
        finally
        {
            // Restore the flag whether the capture succeeded, failed or threw.
            isExecuting = false;
        }
    }

    /// <summary>
    /// Performs a single capture attempt: logs the start, downloads the page,
    /// parses the table and saves the new rows inside one transaction.
    /// </summary>
    /// <returns>
    /// true when the attempt failed because of a timeout and should be retried;
    /// false when it finished (successfully, with an empty response, or with a
    /// non-timeout failure that has been logged).
    /// </returns>
    private bool CaptureDataOnce()
    {
        StringBuilder startLog = new StringBuilder();
        startLog.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
        startLog.Append("开始抓取人民币汇率中间价...\r\n\r\n");
        SetLogging(startLog.ToString());

        // Sample the clock once so the log title and the request URL always agree.
        string today = DateTime.Now.ToString("yyyy-MM-dd");
        bool todayOnly = executeType == "true";
        string title = todayOnly
            ? today + "的数据抓取"
            : "时间范围为:开始时间为" + startTime + ",结束时间为" + endTime + "的数据抓取";

        // NOTE(review): assumes dbContext is an Entity Framework DbContext and that
        // BeginTransaction() returns a disposable transaction - confirm.
        using (var db = new dbContext())
        using (var trans = db.Database.BeginTransaction())
        {
            bool committed = false;
            try
            {
                string basePath = "http://www.safe.gov.cn/AppStructured/view/project_RMBQuery.action";
                string url = todayOnly
                    ? basePath + "?projectBean.startDate=" + today + "&projectBean.endDate=" + today + "&queryYN=true"
                    : basePath + "?projectBean.startDate=" + startTime + "&projectBean.endDate=" + endTime + "&queryYN=true";

                string result = DownloadRatePage(url);
                if (string.IsNullOrEmpty(result))
                {
                    StringBuilder emptyLog = new StringBuilder();
                    emptyLog.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
                    emptyLog.AppendFormat("{0}为空\r\n\r\n\r\n", title);
                    SetLogging(emptyLog.ToString());
                    return false;
                }

                // Collapse the document onto one line so the '.' based patterns
                // below can match across what used to be line breaks.
                result = result.Replace("\r", "");
                result = result.Replace("\n", "");
                result = result.Replace("\t", "");
                result = result.Replace("\\", "");
                result = Regex.Replace(result, @"<!--(?s).*?-->", "", RegexOptions.IgnoreCase);   // strip HTML comments
                result = result.Replace("&nbsp;", "");                                            // strip non-breaking spaces

                string tableHtml = Regex.Match(result, "<table class=\"list\" id=\"InfoTable\".*>.*</table>").ToString();
                MatchCollection rowMatches = Regex.Matches(tableHtml, "<tr class=\"first\".*?>(.*?)</tr>");

                // Built once and reused for every row.
                Regex cellPattern = new Regex("<td.*?>(.*?)</td>");
                // The page is machine-readable data, so parse it culture-independently.
                var invariant = System.Globalization.CultureInfo.InvariantCulture;

                foreach (Match row in rowMatches)
                {
                    List<string> cells = cellPattern.Matches(row.Value)
                        .Cast<Match>()
                        .Select(m => m.Groups[1].Value)
                        .ToList();

                    // Column 0 is the date, columns 1..23 are the rates. A row with
                    // fewer cells makes the indexer throw, which is handled by the
                    // catch below (rollback + failure log).
                    var dateTime = Convert.ToDateTime(cells[0], invariant);
                    var existing = db.RMB_EXCHANGERATE.Where(p => p.TIME == dateTime).FirstOrDefault();
                    if (existing != null)
                    {
                        continue;   // this date is already stored
                    }

                    RMB_EXCHANGERATE rate = new RMB_EXCHANGERATE();
                    rate.ID = Guid.NewGuid().ToString();
                    rate.TIME = dateTime;
                    rate.USD = Convert.ToDecimal(cells[1], invariant);
                    rate.EUR = Convert.ToDecimal(cells[2], invariant);
                    rate.JPY = Convert.ToDecimal(cells[3], invariant);
                    rate.HKD = Convert.ToDecimal(cells[4], invariant);
                    rate.GBP = Convert.ToDecimal(cells[5], invariant);
                    rate.MYR = Convert.ToDecimal(cells[6], invariant);
                    rate.SUR = Convert.ToDecimal(cells[7], invariant);
                    rate.ZAR = Convert.ToDecimal(cells[8], invariant);
                    rate.KRW = Convert.ToDecimal(cells[9], invariant);
                    rate.AED = Convert.ToDecimal(cells[10], invariant);
                    rate.SR = Convert.ToDecimal(cells[11], invariant);
                    rate.HUF = Convert.ToDecimal(cells[12], invariant);
                    rate.PLN = Convert.ToDecimal(cells[13], invariant);
                    rate.DKK = Convert.ToDecimal(cells[14], invariant);
                    rate.SEK = Convert.ToDecimal(cells[15], invariant);
                    rate.NOK = Convert.ToDecimal(cells[16], invariant);
                    rate.ITL = Convert.ToDecimal(cells[17], invariant);
                    rate.PHP = Convert.ToDecimal(cells[18], invariant);
                    rate.AUD = Convert.ToDecimal(cells[19], invariant);
                    rate.CAD = Convert.ToDecimal(cells[20], invariant);
                    rate.NZD = Convert.ToDecimal(cells[21], invariant);
                    rate.SGD = Convert.ToDecimal(cells[22], invariant);
                    rate.CHF = Convert.ToDecimal(cells[23], invariant);
                    rate.CREATETIME = DateTime.Now;
                    db.RMB_EXCHANGERATE.Add(rate);
                }

                db.SaveChanges();
                trans.Commit();
                committed = true;

                StringBuilder successLog = new StringBuilder();
                successLog.AppendFormat("执行时间:{0}\r\n", DateTime.Now);
                successLog.AppendFormat("{0}成功\r\n\r\n", title);
                SetLogging(successLog.ToString());
                return false;
            }
            catch (Exception ex)
            {
                // Rolling back an already committed transaction would throw and hide
                // the real error, so only roll back when nothing was committed.
                if (!committed)
                {
                    trans.Rollback();
                }

                var message = logTemplate2(ex, title + "失败");
                SetLogging(message);
                return IsTimeout(ex);
            }
        }
    }

    /// <summary>
    /// Issues the GET request for the rate page and returns the response body
    /// decoded as UTF-8.
    /// </summary>
    /// <param name="url">Fully built query URL.</param>
    /// <returns>The response body; may be empty.</returns>
    private static string DownloadRatePage(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.ProtocolVersion = HttpVersion.Version11;
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        request.Timeout = 100000;
        // Let the framework both advertise gzip/deflate and decode the response.
        // Sending "Accept-Encoding" by hand without this would hand compressed
        // bytes straight to the StreamReader.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
        request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
        request.CookieContainer = new CookieContainer();

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Tells whether an exception represents a request timeout.
    /// </summary>
    /// <param name="ex">The exception caught during a capture attempt.</param>
    /// <returns>true when the attempt should be retried.</returns>
    private static bool IsTimeout(Exception ex)
    {
        WebException webEx = ex as WebException;
        if (webEx != null && webEx.Status == WebExceptionStatus.Timeout)
        {
            return true;
        }

        // Legacy check kept from the original implementation.
        return ex.Message == "请求超时";
    }
    #endregion

     

     

  • 相关阅读:
    Generate Parentheses
    Length of Last Word
    Maximum Subarray
    Count and Say
    二分搜索算法
    Search Insert Position
    Implement strStr()
    Remove Element
    Remove Duplicates from Sorted Array
    Remove Nth Node From End of List
  • 原文地址:https://www.cnblogs.com/kehaocheng/p/7503812.html
Copyright © 2011-2022 走看看