zoukankan      html  css  js  c++  java
  • 基于Java的数据采集(三)

    《基于Java的数据采集(一)》:http://www.cnblogs.com/lichenwei/p/3904715.html

    《基于Java的数据采集(二)》:http://www.cnblogs.com/lichenwei/p/3905370.html

    《基于Java的数据采集(终结篇)》:http://www.cnblogs.com/lichenwei/p/3910492.html

    基于之前2篇Java数据采集入库的文章,做了下功能整合,实现本地数据的存储与读取,上个效果图:

    直接上代码吧。本程序只是"如何用Java抓取页面并简单采集入库"的入门示例,在实际开发采集工具的时候,还需要考虑许多问题,比如采集某个页面时发生卡顿或延迟该怎么办等等,希望这篇文章能够抛砖引玉。

    先看下项目结构:

    一共有五个类:

    MySql.java  --数据库操作类

    RegEX.java   --正则匹配类

    GetAllData.java --采集类

    Action.java  --功能实现类

    FootBallMain.java --主程序类

    其他的,直接结合前面2篇文章外加看代码注释吧

    MySql.java

     1 package com.lcw.curl;
     2 
     3 
     4 import java.sql.Connection;
     5 import java.sql.DriverManager;
     6 import java.sql.ResultSet;
     7 import java.sql.SQLException;
     8 import java.sql.Statement;
     9 
    10 
    /**
     * Minimal JDBC helper: one method executes an update statement, the other executes a query.
     *
     * <p>Every call opens a fresh connection from the public configuration fields. Failures are
     * printed and never propagate to the caller; a failed query is reported by returning
     * {@code null}.
     *
     * @author Balla_兔子
     */
    public class MySql {

        // Driver class, database URL and credentials used for every new connection.
        public String driver = "com.mysql.jdbc.Driver";
        public String url = "jdbc:mysql://127.0.0.1:3306/football";
        public String user = "root";
        public String password = "";
        // Statement and connection used by the most recent call.
        public Statement stmt = null;
        public Connection conn = null;

        /**
         * Executes a single data-changing statement (insert, update or delete) and then releases
         * the statement and connection, whether or not the statement succeeded.
         *
         * @param insertSQl the complete SQL statement to execute
         */
        public void datatoMySql(String insertSQl) {
            if (!open()) {
                // No connection: nothing to execute and nothing left to release.
                return;
            }
            try {
                stmt.executeUpdate(insertSQl);
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                close();
            }
        }

        /**
         * Executes a query and returns its result set.
         *
         * <p>The result set depends on the statement and connection held by this object, so they
         * are left open on success; call {@link #close()} once the rows have been read.
         *
         * @param selectSQl the complete SQL query to execute
         * @return the open result set, or {@code null} when the connection or the query failed
         */
        public ResultSet searchMySql(String selectSQl) {
            if (!open()) {
                return null;
            }
            try {
                return stmt.executeQuery(selectSQl);
            } catch (SQLException e) {
                e.printStackTrace();
                // The query failed, so no result set needs these resources any more.
                close();
                return null;
            }
        }

        /**
         * Closes the current statement and connection, if any. Each one is closed in its own
         * try block so that a failure closing the statement cannot leak the connection.
         */
        public void close() {
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }

        /**
         * Loads the driver and opens {@link #conn} and {@link #stmt}.
         *
         * @return {@code true} when both were opened, {@code false} otherwise
         */
        private boolean open() {
            try {
                Class.forName(driver);
            } catch (Exception e) {
                // Not fatal on its own: DriverManager may still know a driver for the URL.
                System.out.println("Unable to find the local driver");
                e.printStackTrace();
            }
            try {
                conn = DriverManager.getConnection(url, user, password);
                stmt = conn.createStatement();
                return true;
            } catch (SQLException e) {
                e.printStackTrace();
                // Release a connection whose statement could not be created.
                close();
                return false;
            }
        }

    }
    MySql.java

    RegEX.java

     1 package com.lcw.curl;
     2 
     3 import java.util.regex.Matcher;
     4 import java.util.regex.Pattern;
     5 
     public class RegEX {

         /**
          * Looks for the first place where a regular expression matches inside a text, ignoring
          * letter case.
          *
          * @param regex   the regular expression to search with
          * @param content the text to search in
          * @return the whole matched substring, or an empty string when nothing matches
          */
         public String getData(String regex, String content) {
             Matcher firstHit = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(content);
             return firstHit.find() ? firstHit.group() : "";
         }

     }
    RegEX.java

    GetAllData.java

     1 package com.lcw.curl;
     2 
     3 import java.io.BufferedReader;
     4 import java.io.InputStreamReader;
     5 import java.net.URL;
     6 
     7 public class GetAllData {
     8 
     9     /**采集类
    10      * @param Balla_兔子
    11      */
    12     public void getAllData() {
    13 
    14         try {
    15             String address = "http://www.footballresults.org/league.php?league=EngDiv1";
    16             URL url = new URL(address);
    17             InputStreamReader inputStreamReader = new InputStreamReader(url
    18                     .openStream(), "utf-8");// 打开地址,以UTF-8编码的形式返回字节并转为字符
    19             BufferedReader bufferedReader = new BufferedReader(
    20                     inputStreamReader);// 从字符输入流中读取文本,缓冲各个字符,从而提供字符、数组和行的高效读取。
    21 
    22             RegEX data = new RegEX();
    23             MySql mySql = new MySql();
    24             String content = "";// 用来接受每次读取的行字符
    25             int flag = 0;// 标志,队伍信息刚好在日期信息后面,则正则相同,用于分离数据
    26             String dateRegex = "\d{1,2}\.\d{1,2}\.\d{4}";// 日期匹配正则表达式
    27             String teamRegex = ">[^<>]*</a>";// 队伍匹配正则表达式
    28             String scoreRegex = ">(\d{1,2}-\d{1,2})</TD>";// 比分正则表达式
    29             String tempDate = "";// 存储临时比赛时间
    30             String teama = "";// 存储临时主队
    31             String teamb = "";// 存储临时客队
    32             String score = "";// 存储临时比分
    33             int i = 0;// 记录信息条数
    34             String sql = "";// 数据库语句
    35 
    36             while ((content = bufferedReader.readLine()) != null) {// 每次读取一行数据
    37                 // 获取比赛日期信息
    38                 String dateInfo = data.getData(dateRegex, content);
    39                 if (!dateInfo.equals("")) {
    40                     // System.out.println("日期:" + dateInfo);
    41                     tempDate = dateInfo;
    42                     flag++;
    43                 }
    44                 // 获取队伍信息,需先读到日期信息让标志符自增
    45                 String teamInfo = data.getData(teamRegex, content);
    46                 if (!teamInfo.equals("") && flag == 1) {
    47                     teama = teamInfo.substring(1, teamInfo.indexOf("</a>"));
    48                     // System.out.println("主队:" + teama);
    49                     flag++;
    50                 } else if (!teamInfo.equals("") && flag == 2) {
    51                     teamb = teamInfo.substring(1, teamInfo.indexOf("</a>"));
    52                     // System.out.println("客队:" + teamb);
    53                     flag = 0;
    54                 }
    55                 // 获取比分信息
    56                 String scoreInfo = data.getData(scoreRegex, content);
    57                 if (!scoreInfo.equals("")) {
    58                     score = scoreInfo.substring(1, scoreInfo.indexOf("</TD>"));
    59                     // System.out.println("比分:" + score);
    60                     // System.out.println();
    61                     i++;
    62                     sql = "insert into football(`date`,`teama`,`teamb`,`score`) values('"
    63                             + tempDate
    64                             + "','"
    65                             + teama
    66                             + "','"
    67                             + teamb
    68                             + "','"
    69                             + score + "')";
    70                     mySql.datatoMySql(sql);
    71                     System.out.println("存储数据成功:" + i + "条");
    72                 }
    73 
    74             }
    75             bufferedReader.close();
    76             // System.out.println("一共收集到了" + i + "条信息");
    77         } catch (Exception e) {
    78             e.printStackTrace();
    79         }
    80 
    81     }
    82 
    83 }
    GetAllData.java

    Action.java

      1 package com.lcw.curl;
      2 
      3 import java.sql.ResultSet;
      4 import java.sql.SQLException;
      5 import java.util.ArrayList;
      6 import java.util.List;
      7 import java.util.Vector;
      8 
      9 public class Action {
     10 
     11     /**
     12      * 操作一:初始化数据库数据
     13      */
     14     public void initData() {
     15         String sql = "delete from football";
     16         MySql doMySql = new MySql();
     17         try {
     18             doMySql.datatoMySql(sql);
     19             System.out.println("数据初始化完毕!");
     20         } catch (Exception e) {
     21             System.out.println("数据初始化失败!");
     22         }
     23 
     24     }
     25 
     26     /**
     27      * 获取所有队伍信息
     28      * 
     29      * @return
     30      */
     31     public Vector<String> getAllTeam() {
     32         ResultSet rs = null;
     33         Vector<String> vector = new Vector<String>();
     34         String sql = "select teama,teamb from football";
     35         MySql doMySql = new MySql();
     36         rs = doMySql.searchMySql(sql);
     37 
     38         try {
     39             while (rs.next()) {
     40                 try {
     41                     if (!vector.contains(rs.getString("teama"))) {
     42                         vector.add(rs.getString("teama"));
     43                     }
     44                     if (!vector.contains(rs.getString("teamb"))) {
     45                         vector.add(rs.getString("teamb"));
     46                     }
     47                 } catch (SQLException e) {
     48                     e.printStackTrace();
     49                 }
     50             }
     51         } catch (SQLException e) {
     52             e.printStackTrace();
     53         }
     54 
     55         return vector;
     56 
     57     }
     58 
     59     /**
     60      * 获取具体某队的比赛信息
     61      * 
     62      * @param team
     63      * @return
     64      */
     65     public List<String> findTeam(String team) {
     66         List<String> list = new ArrayList<String>();
     67         String sql = "select * from football where teama ='" + team
     68                 + "' or teamb ='" + team + "'";
     69         MySql mysql = new MySql();
     70         ResultSet rs = null;
     71         rs = mysql.searchMySql(sql);
     72         try {
     73             while (rs.next()) {
     74                 list.add(rs.getString("date"));
     75                 list.add(rs.getString("teama"));
     76                 list.add(rs.getString("teamb"));
     77                 list.add(rs.getString("score"));
     78             }
     79         } catch (SQLException e) {
     80             e.printStackTrace();
     81         }
     82         return list;
     83 
     84     }
     85 
     86     public List<String> findGame(String date) {
     87         List<String> list = new ArrayList<String>();
     88         ResultSet rs = null;
     89         String sql = "select * from football where date ='" + date + "'";
     90         MySql mysql = new MySql();
     91         rs = mysql.searchMySql(sql);
     92         try {
     93             while (rs.next()) {
     94                 list.add(rs.getString("date"));
     95                 list.add(rs.getString("teama"));
     96                 list.add(rs.getString("teamb"));
     97                 list.add(rs.getString("score"));
     98             }
     99         } catch (SQLException e) {
    100             // TODO Auto-generated catch block
    101             e.printStackTrace();
    102         }
    103         return list;
    104     }
    105 
    106 }
    Action.java

    FootBallMain.java

     1 package com.lcw.curl;
     2 
     3 import java.util.List;
     4 import java.util.Scanner;
     5 import java.util.Vector;
     6 
     7 public class FootBallMain {
     8 
     9     /**主程序类
    10      * @param Balla_兔子
    11      */
    12     public static void main(String[] args) {
    13         GetAllData allData = new GetAllData();
    14         Action action = new Action();
    15 
    16         while (true) {
    17             System.out.println("①初始化数据库-请按 (1)");
    18             System.out.println("②自动化采集数据-请按(2)");
    19             System.out.println("③查询参赛队伍-请按(3)");
    20             System.out.println("④查询具体球队比赛结果-请按(4)");
    21             System.out.println("⑤查询具体某天的比赛详情-请按(5)");
    22             Scanner scanner = new Scanner(System.in);
    23             String input = scanner.next();
    24             if (input.equals("1")) {
    25                 System.out.println();
    26                 action.initData();
    27                 System.out
    28                         .println("-----------------------------------------------------");
    29             } else if (input.equals("2")) {
    30                 System.out.println("正在采集数据...请稍后");
    31                 allData.getAllData();
    32                 System.out
    33                         .println("-----------------------------------------------------");
    34             } else if (input.equals("3")) {
    35                 Vector<String> allTeam = action.getAllTeam();
    36                 System.out.println("正在获取数据...请稍后");
    37                 if (allTeam.size() != 0) {
    38                     System.out.println("参赛队伍如下:");
    39                     for (int i = 0; i < allTeam.size(); i++) {
    40                         System.out.println(allTeam.get(i));
    41                     }
    42                 }
    43                 System.out
    44                         .println("-----------------------------------------------------");
    45             } else if (input.equals("4")) {
    46                 System.out.println("请输入您要查询的队伍名:");
    47                 String team = scanner.next();
    48                 List<String> list = action.findTeam(team);
    49                 System.out.println("比赛日期			主队		客队			比赛结果");
    50                 if (list.size() != 0) {
    51                     for (int i = 0; i < list.size(); i++) {
    52                         System.out.print(list.get(i) + "		");
    53                     }
    54                 } else {
    55                     System.out.println("暂时没有您所提供队伍的比赛信息,敬请关注...");
    56                 }
    57                 System.out.println();
    58                 System.out
    59                         .println("-----------------------------------------------------");
    60             } else if (input.equals("5")) {
    61                 System.out.println("请输入您要查询日期(格式如下:xx.xx.xxxx):");
    62                 String date = scanner.next();
    63                 List<String> info = action.findGame(date);
    64                 System.out.println("比赛日期			主队		客队			比赛结果");
    65                 if (info.size() != 0) {
    66                     for (int i = 0; i < info.size(); i++) {
    67                         if (i % 4 == 0 && i != 0) {
    68                             System.out.println();
    69                         }
    70                         System.out.print(info.get(i) + "		");
    71                     }
    72                 } else {
    73                     System.out.println("暂时没有您所提供的比赛信息,敬请关注...");
    74                 }
    75                 System.out.println();
    76                 System.out
    77                         .println("------------------------------------------------------------------------");
    78             } else {
    79                 System.out.println("请输入正确的对应编号..");
    80                 System.out
    81                         .println("------------------------------------------------------------------------");
    82             }
    83         }
    84     }
    85 
    86 }
    FootBallMain.java
  • 相关阅读:
    四则运算出题器
    四则运算出题网页
    四则运算自动生成器实现(python、wxpython、GUI)
    python 实现小学四则运算
    Process and Thread States
    COS AP-开启WPA后无法关联SSID!
    WLC MAC Filtering
    禅道--个人理解 简单介绍
    IDEA解决乱码
    avue 实现自定义列显隐并保存,并且搜索表单、form表单、crud列顺序互不影响。
  • 原文地址:https://www.cnblogs.com/lichenwei/p/3907007.html
Copyright © 2011-2022 走看看