zoukankan      html  css  js  c++  java
  • 关于简单的hive练习

    现给定一个包含一千条原始数据的txt文件,要求清洗掉多余字符,并按照"字段之间以空格分隔、记录之间以换行分隔"的规则导入hive中。

      1、导入txt文件

        使用BufferedReader方法导入txt文件,准备进行处理。

      2、清洗数据

        使用字符串分割函数split()将数据按照空格、/、+、,等字符进行分割。

      3、导出txt文件

        使用FileWriter方法导出txt文件,准备进行上传。

      4、上传文件

        将导出清洗完毕的数据文件上传至hdfs中。

      5、导入hive

        从hdfs中将文件导入hive。

     程序如下

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.util.ArrayList;
    
    public class CleanData {
    
        public static ArrayList<String> ip = new ArrayList<String>();
        public static ArrayList<String> date = new ArrayList<String>();
        public static ArrayList<String> day = new ArrayList<String>();
        public static ArrayList<Long> traffic = new ArrayList<Long>();
        public static ArrayList<String> type = new ArrayList<String>();
        public static ArrayList<String> id = new ArrayList<String>();
    
        public static void cleanData() throws IOException {
            String str;
            File f = new File("/home/ryq1998/Documents/Tencent Files/316703799/FileRecv/result.txt");
            BufferedReader bf = new BufferedReader(new FileReader(f));
            try {
                while ((str = bf.readLine()) != null) {
                    String[] s = str.split(",");
                    ip.add(s[0]);
                    String[] newdate = s[1].split("\\|\:|\b|\+");
                    date.add(newdate[4] + "-" + "11" + "-" + newdate[0] + " " + newdate[6] + ":" + newdate[8] + ":"
                            + newdate[10]);
                    day.add(s[2]);
                    String[] newtriffic = s[3].split(" ");
                    traffic.add(Long.parseLong(newtriffic[0]));
                    type.add(s[4]);
                    id.add(s[5]);
                }
            } catch (FileNotFoundException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } finally {
                bf.close();
                returnText(ip, date, day, traffic, type, id);
                /*
                 * 将数据插入mysql数据库
                 */
                /*addSql(ip, date, day, traffic, type, id);*/
                
            }
        }
    
        /*
         * 存储mysql数据库
         */
        public static void addSql(ArrayList<String> ip, ArrayList<String> date, ArrayList<String> day,
                ArrayList<Long> traffic, ArrayList<String> type, ArrayList<String> id) {
    
            Connection con = null;
            try {
                con = JdbcUtils.getConnection();
                PreparedStatement psql;
                for (int i = 0; i < ip.size(); i++) {
                    psql = con.prepareStatement(
                            "insert into CleanData(ip,date,day,traffic,type,id) " + "values(?,?,?,?,?,?)");
                    psql.setString(1, ip.get(i));
                    psql.setString(2, date.get(i));
                    psql.setString(3, day.get(i));
                    psql.setLong(4, traffic.get(i));
                    psql.setString(5, type.get(i));
                    psql.setString(6, id.get(i));
                    psql.executeUpdate();
                    psql.close();
                }
                con.close();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
    
        }
        
        public static void returnText(ArrayList<String> ip, ArrayList<String> date, ArrayList<String> day,
                ArrayList<Long> traffic, ArrayList<String> type, ArrayList<String> id) {
            
            FileWriter fileWriter = null;
            try {
                fileWriter = new FileWriter("/home/ryq1998/result.txt");//创建文本文件
                int i=0;
                for(;i<ip.size();i++) {
                    if(i==ip.size()-1) {
                        fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i));
                        break;
                    }
                    fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i)+"
    ");//写入 
    换行
                }
                fileWriter.flush();
                fileWriter.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            
            
            
        }
    
        public static void main(String[] args) throws IOException {
            cleanData();
        }
    
    }
    View Code

      截图如下

      

  • 相关阅读:
    UVALive 7141 BombX
    CodeForces 722D Generating Sets
    CodeForces 722C Destroying Array
    CodeForces 721D Maxim and Array
    CodeForces 721C Journey
    CodeForces 415D Mashmokh and ACM
    CodeForces 718C Sasha and Array
    CodeForces 635C XOR Equation
    CodeForces 631D Messenger
    田忌赛马问题
  • 原文地址:https://www.cnblogs.com/YXSZ/p/11854005.html
Copyright © 2011-2022 走看看