zoukankan      html  css  js  c++  java
  • 11月13日的实验进度

    还没有完成,主要是hive没有配置好。程序的清洗部分已经做得差不多了,之前一直出现数组越界的情况,主要原因是我还没有理解mapreduce的工作模式。代码如下:

    import java.lang.String;
    import java.io.IOException;
    import java.util.*;
    import java.text.SimpleDateFormat;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.io.NullWritable;
     
    public class Namecount {
     
             // Date formats shared by the parsing helpers of this class.
             // NOTE(review): SimpleDateFormat instances are not thread-safe; confirm these are only used from one thread per task.
             public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // original (input) time format
             public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd");// current (output) time format
           /**
            * Converts a timestamp string into a {@link Date} using {@link #FORMAT}.
            *
            * @param string timestamp text expected to follow the {@code d/MMM/yyyy:HH:mm:ss} pattern
            * @return the parsed date, or {@code null} if parsing fails for any reason
            *         (the stack trace is printed in that case)
            */
           private Date parseDateFormat(String string) {
               try {
                   return FORMAT.parse(string);
               } catch (Exception e) {
                   // Best effort: report the failure and hand back null to the caller.
                   e.printStackTrace();
                   return null;
               }
           }
            
            public String[] parse(String line) {
                public static ArrayList<String> ip = new ArrayList<String>();
        public static ArrayList<String> date = new ArrayList<String>();
        public static ArrayList<String> day = new ArrayList<String>();
        public static ArrayList<Long> traffic = new ArrayList<Long>();
        public static ArrayList<String> type = new ArrayList<String>();
        public static ArrayList<String> id = new ArrayList<String>();
     
                return new String[] { ip, time, url, status, traffic };
            } 
            private String parseTraffic(String line) {    //流量
                final String trim = line.substring(line.lastIndexOf(""") + 1)
                        .trim();
                String traffic = trim.split(" ")[1];
                return traffic;
            }
           private String parseStatus(String line) {     //状态
                final String trim = line.substring(line.lastIndexOf(""") + 1)
                        .trim();
                String status = trim.split(" ")[0];
                return status;
            }
     
            private String parseURL(String line) {       //url
                final int first = line.indexOf(""");
                final int last = line.lastIndexOf(""");
                String url = line.substring(first + 1, last);
                return url;
            }
            /**
             * Extracts the time field and reformats it with {@code dateformat1}.
             *
             * <p>The raw timestamp is the trimmed text between the first {@code [} and the
             * first occurrence of the literal {@code +0800]}; it is converted by
             * {@code parseDateFormat} before being formatted.
             *
             * @param line a single raw log line
             * @return the timestamp rendered in the {@code yyyy-MM-dd} pattern
             */
            private String parseTime(String line) {
                final int open = line.indexOf("[");
                final int zone = line.indexOf("+0800]");
                final String raw = line.substring(open + 1, zone).trim();
                // NOTE(review): parseDateFormat returns null on failure and that null is
                // passed straight to format() -- presumably this fails; confirm intent.
                return dateformat1.format(parseDateFormat(raw));
            }
            /**
             * Extracts the ip field: the trimmed text that precedes the first {@code "- -"}
             * separator in the line.
             *
             * @param line a single raw log line
             * @return the trimmed first piece produced by splitting the line on {@code "- -"}
             */
            private String parseIP(String line) {
                final String[] pieces = line.split("- -");
                return pieces[0].trim();
            }
        public static class Map extends
                Mapper<LongWritable, Text, Text, IntWritable> {
                    
            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // 将输入的纯文本文件的数据转化成String
                Text outputValue = new Text();
                String line = value.toString();
                 Namecount aa=new Namecount();
                StringTokenizer tokenizerArticle = new StringTokenizer(line, "
    ");
     
                // 分别对每一行进行处理
                while (tokenizerArticle.hasMoreElements()) {
                    // 每行按空格划分
                  String stra=tokenizerArticle.nextToken().toString();
                  String [] Newstr=aa.parse(stra);
     
               if (Newstr[2].startsWith("GET /")) { //过滤开头字符串
                    Newstr[2] = Newstr[2].substring("GET /".length());
                } 
              else if (Newstr[2].startsWith("POST /")) {
                    Newstr[2] = Newstr[2].substring("POST /".length());
                }
               if (Newstr[2].endsWith(" HTTP/1.1")) { //过滤结尾字符串
                    Newstr[2] = Newstr[2].substring(0, Newstr[2].length()
                            - " HTTP/1.1".length());
                }
                  String[] words = Newstr[2].split("/");
                  if(words.length==4){
                      outputValue.set(Newstr[0] + "	" + Newstr[1] + "	" + words[0]+"	"+words[1]+"	"+words[2]+"	"+words[3]+"	"+"0");
                       context.write(outputValue,new IntWritable(1));                 
    }    
        }
      }
    }
     
        /**
         * Reducer (also registered as the combiner in {@code main}) that adds up all
         * counts received for a key and writes the key with that total.
         */
        public static class Reduce extends
                Reducer<Text, IntWritable, Text, IntWritable> {

            /**
             * Sums {@code values} and emits {@code (key, total)}.
             *
             * @param key     the grouped key
             * @param values  the counts collected for {@code key}
             * @param context sink for the summed result
             */
            public void reduce(Text key, Iterable<IntWritable> values,
                    Context context) throws IOException, InterruptedException {
                int total = 0;
                for (IntWritable count : values) {
                    total += count.get();
                }
                context.write(key, new IntWritable(total));
            }
        }
        /**
         * Configures and runs the {@code name_goods_count} job, then exits with status 0 on
         * success or 1 on failure.
         *
         * <p>Generic Hadoop options are consumed from {@code args}. If exactly two
         * arguments remain they are used as the input and output paths; if none remain,
         * the built-in HDFS paths are used; any other count prints a usage message and
         * exits with status 2.
         *
         * @param args optional generic Hadoop options followed by {@code <in> <out>}
         * @throws Exception if job setup or execution fails
         */
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("mapred.jar", "Namecount.jar");

            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            final Path in;
            final Path out;
            if (otherArgs.length == 2) {
                in = new Path(otherArgs[0]);
                out = new Path(otherArgs[1]);
            } else if (otherArgs.length == 0) {
                // Defaults kept from the original hard-coded configuration.
                in = new Path("hdfs://localhost:9000/mymapreduce3/123/12345.txt");
                out = new Path("hdfs://localhost:9000/mymapreduce3/out");
            } else {
                System.err.println("Usage: Namecount [<in> <out>]");
                System.exit(2);
                return; // unreachable at runtime; lets the compiler see in/out as assigned
            }

            Job job = Job.getInstance(conf, "name_goods_count");
            job.setJarByClass(Namecount.class);

            // Map, combine and reduce classes; Reduce doubles as the combiner.
            job.setMapperClass(Map.class);
            job.setCombinerClass(Reduce.class);
            job.setReducerClass(Reduce.class);

            // Output key/value types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Input and output formats.
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            // Input and output locations.
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);

            // Submit the job and block until it finishes; without this the job never runs.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

    如此,还有一点小错误,明天应该可以完成剩余部分以及导入hive了

  • 相关阅读:
    【转】Android系统中Fastboot和Recovery所扮演的角色。
    【转】Android ROM分析(1):刷机原理及方法
    【转】ANDROIDROM制作(一)——ROM结构介绍、精简和内置、一般刷机过程
    【转】使用fastboot命令刷机流程详解
    检测是否安装或者开启flash
    CentOS中/英文环境切换教程(CentOS6.8)
    id: cannot find name for user ID xxx处理办法
    linux重命名所有find查找到的文件/文件夹
    linux过滤旧文件中的空行和注释行剩余内容组成新文件
    CentOS和AIX查看系统序列号
  • 原文地址:https://www.cnblogs.com/jyt123/p/11852158.html
Copyright © 2011-2022 走看看