zoukankan      html  css  js  c++  java
  • 关于简单的hive练习

    现给定一个包含一千条原始数据的txt文件,要求清洗掉其中的多余字符,并按照"字段以空格分隔、记录以换行分隔"的规则导入hive中。

      1、导入txt文件

        使用BufferedReader类逐行读取txt文件,准备进行处理。

      2、清洗数据

        使用字符串分割函数split()将数据按照空格、/、+、,等字符进行分割。

      3、导出txt文件

        使用FileWriter类将清洗后的数据写出为txt文件,准备进行上传。

      4、上传文件

        将导出清洗完毕的数据文件上传至hdfs中。

      5、导入hive

        从hdfs中将文件导入hive。

     程序如下

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.util.ArrayList;
    
    public class CleanData {
    
        public static ArrayList<String> ip = new ArrayList<String>();
        public static ArrayList<String> date = new ArrayList<String>();
        public static ArrayList<String> day = new ArrayList<String>();
        public static ArrayList<Long> traffic = new ArrayList<Long>();
        public static ArrayList<String> type = new ArrayList<String>();
        public static ArrayList<String> id = new ArrayList<String>();
    
        public static void cleanData() throws IOException {
            String str;
            File f = new File("/home/ryq1998/Documents/Tencent Files/316703799/FileRecv/result.txt");
            BufferedReader bf = new BufferedReader(new FileReader(f));
            try {
                while ((str = bf.readLine()) != null) {
                    String[] s = str.split(",");
                    ip.add(s[0]);
                    String[] newdate = s[1].split("\\|\:|\b|\+");
                    date.add(newdate[4] + "-" + "11" + "-" + newdate[0] + " " + newdate[6] + ":" + newdate[8] + ":"
                            + newdate[10]);
                    day.add(s[2]);
                    String[] newtriffic = s[3].split(" ");
                    traffic.add(Long.parseLong(newtriffic[0]));
                    type.add(s[4]);
                    id.add(s[5]);
                }
            } catch (FileNotFoundException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } finally {
                bf.close();
                returnText(ip, date, day, traffic, type, id);
                /*
                 * 将数据插入mysql数据库
                 */
                /*addSql(ip, date, day, traffic, type, id);*/
                
            }
        }
    
        /*
         * 存储mysql数据库
         */
        public static void addSql(ArrayList<String> ip, ArrayList<String> date, ArrayList<String> day,
                ArrayList<Long> traffic, ArrayList<String> type, ArrayList<String> id) {
    
            Connection con = null;
            try {
                con = JdbcUtils.getConnection();
                PreparedStatement psql;
                for (int i = 0; i < ip.size(); i++) {
                    psql = con.prepareStatement(
                            "insert into CleanData(ip,date,day,traffic,type,id) " + "values(?,?,?,?,?,?)");
                    psql.setString(1, ip.get(i));
                    psql.setString(2, date.get(i));
                    psql.setString(3, day.get(i));
                    psql.setLong(4, traffic.get(i));
                    psql.setString(5, type.get(i));
                    psql.setString(6, id.get(i));
                    psql.executeUpdate();
                    psql.close();
                }
                con.close();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
    
        }
        
        public static void returnText(ArrayList<String> ip, ArrayList<String> date, ArrayList<String> day,
                ArrayList<Long> traffic, ArrayList<String> type, ArrayList<String> id) {
            
            FileWriter fileWriter = null;
            try {
                fileWriter = new FileWriter("/home/ryq1998/result.txt");//创建文本文件
                int i=0;
                for(;i<ip.size();i++) {
                    if(i==ip.size()-1) {
                        fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i));
                        break;
                    }
                    fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i)+"
    ");//写入 
    换行
                }
                fileWriter.flush();
                fileWriter.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            
            
            
        }
    
        public static void main(String[] args) throws IOException {
            cleanData();
        }
    
    }
    View Code

      截图如下

      

  • 相关阅读:
    Mybatis入门
    Spring的xml文件配置方式实现AOP
    jquery简直是太酷炫强大了
    [Google Guava] 2.2-新集合类型
    小规模的流处理框架.Part 1: thread pools
    数据库三大范式和五大约束
    Hibernate:缓存
    MyBatis:缓存配置
    Python:协程
    微信公众号开发之测试账号
  • 原文地址:https://www.cnblogs.com/YXSZ/p/11854005.html
Copyright © 2011-2022 走看看