  • MapReduce + Hive data operations - Day 1

      Task: given a text file, clean the data it contains, load it into a Hive database, and then run the related statistics.

      The assignment asks us to do the data cleaning, and the statistics, with MapReduce. As a MapReduce beginner I am still not clear on how it works, so for now I did the cleaning in plain Java and then loaded the result into the database.

      On to the code:

      The Java data-cleaning code:

    package Data;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    
    public class Data {
    
        public static void main(String[] args) {
            // try-with-resources flushes and closes both streams even on
            // error; without closing the BufferedWriter, the lines still
            // sitting in its buffer never reach the file (see problem one
            // below).
            File outFile = new File("result2.txt");
            try (BufferedReader br = new BufferedReader(new FileReader("result.txt"));
                 BufferedWriter bw = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(outFile), "utf-8"))) {
                String row;
                while ((row = br.readLine()) != null) {
                    String[] data = split(row);   // split the CSV line into six fields
                    data = normalize(data);       // normalize the date, trim the last field
                    for (int i = 0; i < data.length; i++) {
                        System.out.print(data[i] + "\t");
                    }
                    System.out.println();
                    row = data[0] + "," + data[1] + "," + data[2] + ","
                            + data[3] + "," + data[4] + "," + data[5];
                    bw.write(row + "\n");
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
        private static String[] normalize(String[] data) {
            // data[1] is a timestamp of the form day/Mon/year:HH:mm:ss;
            // split it on '/', ':' and ' ' into its pieces.
            char[] str = data[1].toCharArray();
            String[] time = new String[7];
            int j = 0;
            int k = 0;
            for (int i = 0; i < str.length; i++) {
                if (str[i] == '/' || str[i] == ':' || str[i] == ' ') {
                    time[k] = data[1].substring(j, i);
                    j = i + 1;
                    k++;
                }
            }
            time[k] = data[1].substring(j);

            // Map the English month abbreviation to a two-digit month.
            switch (time[1]) {
                case "Jan": time[1] = "01"; break;
                case "Feb": time[1] = "02"; break;
                case "Mar": time[1] = "03"; break;
                case "Apr": time[1] = "04"; break;
                case "May": time[1] = "05"; break;
                case "Jun": time[1] = "06"; break;
                case "Jul": time[1] = "07"; break;
                case "Aug": time[1] = "08"; break;
                case "Sep": time[1] = "09"; break;
                case "Oct": time[1] = "10"; break;
                case "Nov": time[1] = "11"; break;
                case "Dec": time[1] = "12"; break;
            }

            // Reassemble as yyyy-MM-dd HH:mm:ss and drop the trailing
            // character of the fourth field.
            data[1] = time[2] + "-" + time[1] + "-" + time[0] + " "
                    + time[3] + ":" + time[4] + ":" + time[5];
            data[3] = data[3].substring(0, data[3].length() - 1);
            return data;
        }
    
        private static String[] split(String row) {
            // Split a comma-separated line into its six fields.
            char[] str1 = row.toCharArray();
            String[] data = new String[6];
            int j = 0;
            int k = 0;
            for (int i = 0; i < str1.length; i++) {
                if (str1[i] == ',') {
                    data[k] = row.substring(j, i);
                    j = i + 1;
                    k++;
                }
            }
            data[k] = row.substring(j);
            return data;
        }
    
    }
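
      A side note on the month-name switch: the JDK can do the same conversion with SimpleDateFormat, avoiding the twelve-case table. Below is a minimal sketch, assuming the timestamps really have the day/Mon/year:HH:mm:ss shape the splitter implies; the class name and the sample value are mine, not from the assignment.

    package Data;

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Locale;

    public class DateNormalize {
        // Input pattern inferred from the splitter above; Locale.ENGLISH is
        // required so month names like "Nov" parse regardless of the
        // machine's default locale.
        private static final SimpleDateFormat IN =
                new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        private static final SimpleDateFormat OUT =
                new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        static String normalize(String raw) throws ParseException {
            return OUT.format(IN.parse(raw));
        }

        public static void main(String[] args) throws ParseException {
            // Hypothetical sample value, for illustration only.
            System.out.println(normalize("10/Nov/2016:00:01:02")); // 2016-11-10 00:01:02
        }
    }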

      The code for loading into the database:

    package Hive;
    
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    
    import org.apache.log4j.Logger;
    
    public class Data {
        private static String driverName = "org.apache.hive.jdbc.HiveDriver";
        private static String url = "jdbc:hive2://192.168.43.18:10000/text";
        private static String user = "hive";
        private static String password = "hive";
        private static String sql;
        //private static ResultSet res;
        private static final Logger log = Logger.getLogger(Data.class);
     
        public static void main(String[] args) {
            try {
                Class.forName(driverName);
                Connection conn = DriverManager.getConnection(url, user, password);
                Statement stmt = conn.createStatement();
     
                sql = "load data local inpath '/home/hadoop/下载/result2.txt' overwrite into table data";//显示全部表
                System.out.println("Running:" + sql);
                boolean f=stmt.execute(sql);
                System.out.println("显示结果:" + sql);
                System.out.println("result:" + f);
     
            stmt.close();
            conn.close();
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
                log.error(driverName + " not found!", e);
                System.exit(1);
            } catch (SQLException e) {
                e.printStackTrace();
                log.error("Connection error!", e);
                System.exit(1);
            }
     
        }
    
    }
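
      One thing the LOAD statement takes for granted is that the target table data already exists with six comma-delimited columns matching the cleaned file. Below is a sketch of how that DDL could be issued over the same JDBC connection; the column names and types are my assumption, not taken from the assignment.

    package Hive;

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    public class CreateTable {
        public static void main(String[] args) throws Exception {
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            try (Connection conn = DriverManager.getConnection(
                         "jdbc:hive2://192.168.43.18:10000/text", "hive", "hive");
                 Statement stmt = conn.createStatement()) {
                // Hypothetical schema: six STRING columns for the six
                // comma-separated fields the cleaner writes out.
                stmt.execute("CREATE TABLE IF NOT EXISTS data ("
                        + "ip STRING, time_local STRING, day STRING, "
                        + "traffic STRING, type STRING, id STRING) "
                        + "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','");
            }
        }
    }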

    Screenshot: (image omitted)

      

     Problems encountered so far:

      One: with BufferedWriter, part of the data at the very end never got written into the file. Most likely the cause is that the writer was never flushed or closed, so whatever was still in its buffer when the program exited was lost; the fixed code above closes the writer with try-with-resources. A minimal demonstration follows.
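
    package Data;

    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;

    public class FlushDemo {
        public static void main(String[] args) throws IOException {
            // The file name is made up for the demo.
            BufferedWriter bw = new BufferedWriter(new FileWriter("demo.txt"));
            bw.write("this line sits in the writer's internal buffer\n");
            // Without the next line, the program can exit with the buffer
            // unwritten, leaving demo.txt empty or truncated.
            bw.close(); // close() flushes the buffer, then closes the file
        }
    }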

      Two: I am still not very familiar with MapReduce or the principles behind it; a minimal sketch of a map-only cleaning job follows.
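
      Since the assignment actually asks for MapReduce, here is a minimal sketch of how the same cleaning could be expressed as a map-only Hadoop job. This is my illustration of the idea, not code from the assignment; it assumes the Hadoop client libraries are on the classpath and that the split/normalize helpers in the Data class are made package-visible so the mapper can call them.

    package Data;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class CleanJob {

        // Map-only job: each input line is cleaned and written back out;
        // no reducer is needed because nothing is aggregated yet.
        public static class CleanMapper
                extends Mapper<Object, Text, Text, NullWritable> {
            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                // Reuses the helpers from the Data class above (assumed
                // package-visible here instead of private).
                String cleaned = String.join(",",
                        Data.normalize(Data.split(value.toString())));
                context.write(new Text(cleaned), NullWritable.get());
            }
        }

        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration(), "data clean");
            job.setJarByClass(CleanJob.class);
            job.setMapperClass(CleanMapper.class);
            job.setNumReduceTasks(0); // map-only: mapper output is the final output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }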
