zoukankan      html  css  js  c++  java
  • 爬虫双色球所有的历史数据并保存到SQLite

    前言

    上一篇介绍了双色球走势图是怎么实现的,这一篇介绍怎么爬取所有的双色球历史数据,同时分享怎么同步福彩3D数据。全部采用C#来实现。

    同步双色球的地址:https://datachart.500.com/ssq/history/newinc/history.php?start={0}&end={1} 

    同步福彩3D的地址:https://datachart.500.com/sd/history/inc/history.php?start={0}&end={1}

    上一篇介绍走势图的实现:https://www.cnblogs.com/luoyuhao/p/13887935.html

    打开网站显示的数据如下:

    实现爬虫的过程

    创建接收存储双色球历史数据的table

     抓取网站上的html ,再通过html 去解析相应的数据。

    通过以上截取后就只剩下需要的双色球历史数据了,再去截取tr、td数据,就能得到相应的数据了,是不是很简单的一件事呀!

    解析tr所有数据

            /// <summary>
            /// Splits the table-body markup into individual rows and appends one populated DataRow per row to ssqdt.
            /// </summary>
            /// <param name="content">The markup found between &lt;tbody&gt; and &lt;/tbody&gt;.</param>
            private void ResolveSSQTr(string content)
            {
                // Every data row opens with this exact tag; the inner quotes have to be escaped inside the literal.
                Regex regex = new Regex("<tr class=\"t_tr1\">");
                MatchCollection matches = regex.Matches(content);
                for (int i = 0; i < matches.Count; i++)
                {
                    int rowStart = matches[i].Index;
                    // A row runs up to the next opening tag, or to the end of the content for the last row.
                    int rowEnd = i + 1 < matches.Count ? matches[i + 1].Index : content.Length;
                    string trContent = content.Substring(rowStart, rowEnd - rowStart);

                    DataRow dr = ssqdt.NewRow();
                    ResolveSSQTd(ref dr, trContent);
                    ssqdt.Rows.Add(dr);
                }
            }

    解析所有td内容

     /// <summary>
            /// Fills one double-colour-ball DataRow from the markup of a single table row.
            /// </summary>
            /// <param name="dr">Row to populate; the columns QiHao, Type, B, OpenDate, Week, R1-R6 and HTML are written.</param>
            /// <param name="trContent">Markup of one table row, as cut out by the row splitter.</param>
            private void ResolveSSQTd(ref DataRow dr, string trContent)
            {
                // The 5-character issue number is read at a fixed offset after the first "<td>".
                // NOTE(review): the extra 13 characters presumably skip a short commented-out cell that
                // precedes the issue number on the page - confirm against the live markup.
                string patternQiHao = "<td>";
                Match qhMatch = new Regex(patternQiHao).Match(trContent);
                string qiHao = trContent.Substring(qhMatch.Index + 13 + patternQiHao.Length, 5);
                dr["QiHao"] = qiHao;

                // Even issue numbers are labelled "双期", odd ones "单期".
                dr["Type"] = int.Parse(qiHao) % 2 == 0 ? "双期" : "单期";

                // Red balls: the two characters that follow each t_cfont2 cell tag.
                string patternChartBall = "<td class=\"t_cfont2\">";
                List<int> redBoxList = new List<int>();
                foreach (Match r in new Regex(patternChartBall).Matches(trContent))
                {
                    redBoxList.Add(Convert.ToInt32(trContent.Substring(r.Index + patternChartBall.Length, 2)));
                }

                // Blue ball: the two characters that follow the first t_cfont4 cell tag, if there is one.
                patternChartBall = "<td class=\"t_cfont4\">";
                Match blueMatch = new Regex(patternChartBall).Match(trContent);
                if (blueMatch.Success)
                {
                    dr["B"] = Convert.ToInt32(trContent.Substring(blueMatch.Index + patternChartBall.Length, 2));
                }

                // Draw date in yyyy-MM-dd form; when several dates appear in the row the last one wins.
                foreach (Match r in new Regex(@"\d{4}-\d{2}-\d{2}").Matches(trContent))
                {
                    DateTime dt = Convert.ToDateTime(r.Value);
                    dr["OpenDate"] = dt;
                    dr["Week"] = CaculateWeekDay(dt.Year, dt.Month, dt.Day);
                }

                // Store the red balls in ascending order as R1..R6.
                redBoxList.Sort();
                dr["R1"] = redBoxList[0];
                dr["R2"] = redBoxList[1];
                dr["R3"] = redBoxList[2];
                dr["R4"] = redBoxList[3];
                dr["R5"] = redBoxList[4];
                dr["R6"] = redBoxList[5];
                dr["HTML"] = trContent;
            }

    保存所解析的数据保存到SQLite

    if (!Directory.Exists(tempPath))
                {
                    // Create the target folder before the database path is used.
                    Directory.CreateDirectory(tempPath);
                }
                // The SQLite database file is stored under a .dll file name.
                string _db = System.IO.Path.Combine(tempPath, "cpdb.dll");
                SQLiteHelper helper = new SQLiteHelper(_db);
                OperResult oper = new OperResult();

                // Start from a clean table on every run.
                helper.ExecuteNonQuery(string.Format("drop table if exists {0}", ssqtableName), _db);

                // Build the CREATE TABLE statement with one "name type" line per DataTable column.
                // The separator is written in front of every column except the first, so there is
                // no trailing comma to strip afterwards (and nothing to fail on when there are no columns).
                string msg = string.Format("create table if not exists {0} (\r\n", ssqtableName);
                for (int i = 0; i < ssqdt.Columns.Count; i++)
                {
                    if (i > 0)
                    {
                        msg += ",\r\n";
                    }
                    msg += string.Format("{0} {1}", ssqdt.Columns[i].ColumnName, helper.TypeToSqliteType(ssqdt.Columns[i].DataType));
                }
                msg += ")";

                oper = helper.ExecuteNonQuery(msg, _db);
                oper = helper.SaveDataTable(ssqdt, ssqtableName);

    获取网页内容的方法

          /// <summary>
            /// Downloads the body of a URL with a GET request and returns it as text.
            /// </summary>
            /// <param name="url">Address to request.</param>
            /// <param name="encoding">Name of the text encoding used to decode the response body (passed to Encoding.GetEncoding).</param>
            /// <returns>The complete response body.</returns>
            public static string HttpGet(string url,string encoding)
            {
                WebRequest request = WebRequest.Create(url);

                // Dispose the response, its stream and the reader even when reading fails,
                // so the underlying connection is released.
                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(encoding)))
                {
                    return reader.ReadToEnd();
                }
            }

    基姆拉尔森计算公式计算日期

    /// <summary>
            /// Works out the day of the week for a calendar date with the Kim Larsen formula.
            /// </summary>
            /// <param name="y">Year.</param>
            /// <param name="m">Month number; 1 and 2 are treated as months 13 and 14 of the previous year.</param>
            /// <param name="d">Day of the month.</param>
            /// <returns>The weekday label, "周一" for remainder 0 through "周日" for remainder 6; an empty string for any other remainder.</returns>
            protected string CaculateWeekDay(int y, int m, int d)
            {
                // January and February are evaluated as months 13 and 14 of the year before,
                // e.g. 2004-1-10 goes into the formula as 2003-13-10.
                bool shiftToPreviousYear = m == 1 || m == 2;
                int month = shiftToPreviousYear ? m + 12 : m;
                int year = shiftToPreviousYear ? y - 1 : y;

                int remainder = (d + 2 * month + 3 * (month + 1) / 5 + year + year / 4 - year / 100 + year / 400) % 7;

                string[] labels = { "周一", "周二", "周三", "周四", "周五", "周六", "周日" };
                if (remainder < 0 || remainder >= labels.Length)
                {
                    return "";
                }
                return labels[remainder];
            }

    保存完后,显示的形式就是以DLL结尾的文件了,如下图:

     怎么读取保存的双色球数据

       string _db = System.IO.Path.Combine(tempPath, "cpdb.dll");
                SQLiteHelper helper = new SQLiteHelper(_db);
                OperResult oper = new OperResult();
                oper = helper.GetDataSet(string.Format("select * from {0}", ssqtableName), _db);

                // Only publish values when the query succeeded and returned at least one row.
                if (oper.State == 1 && oper.DataSet != null && oper.DataSet.Tables.Count > 0)
                {
                    var firstTable = oper.DataSet.Tables[0];
                    if (firstTable.Rows.Count > 0)
                    {
                        var firstRow = firstTable.Rows[0];
                        ViewBag.QiHao = "" + firstRow["QiHao"];

                        // Each ball number is shown with two characters: a single character gets a leading "0".
                        string blue = firstRow["B"].ToString();
                        ViewBag.B = blue.Length == 1 ? "0" + blue : blue;

                        string red1 = firstRow["R1"].ToString();
                        ViewBag.R1 = red1.Length == 1 ? "0" + red1 : red1;

                        string red2 = firstRow["R2"].ToString();
                        ViewBag.R2 = red2.Length == 1 ? "0" + red2 : red2;

                        string red3 = firstRow["R3"].ToString();
                        ViewBag.R3 = red3.Length == 1 ? "0" + red3 : red3;

                        string red4 = firstRow["R4"].ToString();
                        ViewBag.R4 = red4.Length == 1 ? "0" + red4 : red4;

                        string red5 = firstRow["R5"].ToString();
                        ViewBag.R5 = red5.Length == 1 ? "0" + red5 : red5;

                        string red6 = firstRow["R6"].ToString();
                        ViewBag.R6 = red6.Length == 1 ? "0" + red6 : red6;
                    }
                }

    以上即完成双色球历史数据的爬虫和保存数据的全部过程,思路非常清晰,而且方法容易,这样就不需要调用收费的接口了,为开发双色球走势图打好良好的基础,大家有没有觉得非常容易呀!其实爬虫的方法很多,希望跟大家一起交流学习,也可以通过正则表达式来获取相应的数据,也可以用第三方工具去解析,更简单快捷。曾经我也做过相应的爬虫,还挺好玩的。

    爬虫福彩3D数据

     同样先创建接收数据的表结构来接收数据。

     解析tr数据

    string trContent = string.Empty;
                Regex regex = new Regex("<tr>");
                // Locate every <tr> opening tag inside the table-body markup.
                MatchCollection matches = regex.Matches(content);
                for (int i = 0; i < matches.Count; i++)
                {
                    int rowStart = matches[i].Index;
                    // The last row extends to the end of the content; every other row ends where the next <tr> begins.
                    bool isLastRow = i == matches.Count - 1;
                    int rowEnd = isLastRow ? content.Length : matches[i + 1].Index;
                    trContent = content.Substring(rowStart, rowEnd - rowStart);

                    // No new DataRow is added here; the row markup is handed to ResolveSDSJTd.
                    ResolveSDSJTd(trContent);
                }

    解析td数据

     /// <summary>
            /// Reads the issue number and three single-digit balls from one table row and writes the balls
            /// into the R4, R5 and R6 columns of the sddt row with the same issue number.
            /// </summary>
            /// <param name="trContent">Markup of one table row.</param>
            private void ResolveSDSJTd(string trContent)
            {
                // Issue number: the first run of seven digits that does not start with 0.
                Match qihaoMatch = new Regex(@"[1-9]\d{6}").Match(trContent);
                string qihao = qihaoMatch.Success ? qihaoMatch.Value : "";

                // Balls: the single character that follows each chartBall01 cell tag.
                string patternChartBall = "<td class=\"chartBall01\" width=18>";
                List<int> redBoxList = new List<int>();
                foreach (Match r in new Regex(patternChartBall).Matches(trContent))
                {
                    redBoxList.Add(int.Parse(trContent.Substring(r.Index + patternChartBall.Length, 1)));
                }

                // Rows without an issue number or without all three balls carry nothing to store.
                if (string.IsNullOrEmpty(qihao) || redBoxList.Count < 3)
                {
                    return;
                }

                DataRow[] dr = sddt.Select(string.Format("QiHao='{0}'", qihao));
                if (dr.Length > 0)
                {
                    dr[0]["R4"] = redBoxList[0];
                    dr[0]["R5"] = redBoxList[1];
                    dr[0]["R6"] = redBoxList[2];
                }
            }

    保存数据

    if (!Directory.Exists(tempPath))
                {
                    // Create the target folder before the database path is used.
                    Directory.CreateDirectory(tempPath);
                }
                // The SQLite database file is stored under a .dll file name.
                string _db = System.IO.Path.Combine(tempPath, "sddb.dll");
                SQLiteHelper helper = new SQLiteHelper(_db);
                OperResult oper = new OperResult();

                // Start from a clean table on every run.
                helper.ExecuteNonQuery(string.Format("drop table if exists {0}", sdtableName), _db);

                // Build the CREATE TABLE statement with one "name type" line per DataTable column.
                // The separator is written in front of every column except the first, so there is
                // no trailing comma to strip afterwards (and nothing to fail on when there are no columns).
                string msg = string.Format("create table if not exists {0} (\r\n", sdtableName);
                for (int i = 0; i < sddt.Columns.Count; i++)
                {
                    if (i > 0)
                    {
                        msg += ",\r\n";
                    }
                    msg += string.Format("{0} {1}", sddt.Columns[i].ColumnName, helper.TypeToSqliteType(sddt.Columns[i].DataType));
                }
                msg += ")";

                oper = helper.ExecuteNonQuery(msg, _db);
                oper = helper.SaveDataTable(sddt, sdtableName);

    以上即可完成对福彩3D数据的爬虫,还能爬取福彩3D试机号,这些数据一下就能全部爬取完了,非常快捷、简单、方便。

    双色球走势图如下:

    上一篇文章有介绍怎么去实现双色球走势图,这里就不再过多介绍了,这一篇主要是介绍怎么爬虫双色球和福彩3D数据。

    详细介绍:https://www.cnblogs.com/luoyuhao/p/13887935.html

    后记

    要做好走势图,爬虫技术是一个前提条件,两个步骤是相辅相成的。现在这两个技术都具备了,只要不断完善技术,就能做出更强大的走势图网站。以上只是纯技术交流和学习,欢迎大家一起留言交流,共同学习进步。

     可加QQ群:186841119

  • 相关阅读:
    Task示例,多线程
    request
    do put in ruby
    Ruby零星笔记
    Git的常用操作
    如何在Rails中执行Get/Post/Put请求
    Lua中的基本函数库
    Step By Step(Lua目录)
    position:fixed失效原因
    前端性能监控-window.performance.timing篇
  • 原文地址:https://www.cnblogs.com/luoyuhao/p/13893335.html
Copyright © 2011-2022 走看看