  • Lab progress for November 13

    Not finished yet, mainly because Hive is still not configured correctly. The data-cleaning program is mostly done. Earlier it kept throwing array out-of-bounds exceptions, mainly because I had not yet understood how MapReduce processes its input (see the defensive-parsing sketch after the listing). The code so far:

    import java.lang.String;
    import java.io.IOException;
    import java.util.*;
    import java.text.SimpleDateFormat;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.io.NullWritable;
     
    public class Namecount {

        public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); // source time format
        public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd"); // target time format

        private Date parseDateFormat(String string) { // convert the time format
            Date parse = null;
            try {
                parse = FORMAT.parse(string);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return parse;
        }
            
        public String[] parse(String line) {
            String ip = parseIP(line);
            String time = parseTime(line);
            String url = parseURL(line);
            String status = parseStatus(line);
            String traffic = parseTraffic(line);
            return new String[] { ip, time, url, status, traffic };
        }
        private String parseTraffic(String line) { // traffic
            final String trim = line.substring(line.lastIndexOf("\"") + 1)
                    .trim();
            String traffic = trim.split(" ")[1];
            return traffic;
        }

        private String parseStatus(String line) { // status
            final String trim = line.substring(line.lastIndexOf("\"") + 1)
                    .trim();
            String status = trim.split(" ")[0];
            return status;
        }
     
        private String parseURL(String line) { // url
            final int first = line.indexOf("\"");
            final int last = line.lastIndexOf("\"");
            String url = line.substring(first + 1, last);
            return url;
        }

        private String parseTime(String line) { // time
            final int first = line.indexOf("[");
            final int last = line.indexOf("+0800]");
            String time = line.substring(first + 1, last).trim();
            Date date = parseDateFormat(time);
            return dateformat1.format(date);
        }

        private String parseIP(String line) { // ip
            String ip = line.split("- -")[0].trim();
            return ip;
        }
        public static class Map extends
                Mapper<LongWritable, Text, Text, IntWritable> {

            public void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // turn the current record of the plain-text input into a String
                Text outputValue = new Text();
                String line = value.toString();
                Namecount aa = new Namecount();
                StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");

                // process the input line by line
                while (tokenizerArticle.hasMoreElements()) {
                    String stra = tokenizerArticle.nextToken().toString();
                    String[] Newstr = aa.parse(stra);

                    if (Newstr[2].startsWith("GET /")) { // strip the leading method prefix
                        Newstr[2] = Newstr[2].substring("GET /".length());
                    } else if (Newstr[2].startsWith("POST /")) {
                        Newstr[2] = Newstr[2].substring("POST /".length());
                    }
                    if (Newstr[2].endsWith(" HTTP/1.1")) { // strip the trailing protocol suffix
                        Newstr[2] = Newstr[2].substring(0, Newstr[2].length()
                                - " HTTP/1.1".length());
                    }
                    String[] words = Newstr[2].split("/");
                    if (words.length == 4) { // only keep URLs with exactly four path segments
                        outputValue.set(Newstr[0] + "\t" + Newstr[1] + "\t" + words[0] + "\t" + words[1]
                                + "\t" + words[2] + "\t" + words[3] + "\t" + "0");
                        context.write(outputValue, new IntWritable(1));
                    }
                }
            }
        }
     
        public static class Reduce extends
                Reducer<Text, IntWritable, Text, IntWritable> {
            // implement the reduce function
            public void reduce(Text key, Iterable<IntWritable> values,
                    Context context) throws IOException, InterruptedException {
                int sum = 0;
                Iterator<IntWritable> iterator = values.iterator();
                while (iterator.hasNext()) {
                    sum += iterator.next().get();
                }
                context.write(key, new IntWritable(sum));
            }
        }
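        // Note: main() below reuses Reduce as the combiner; that is safe here
        // because summing IntWritable counts is associative and commutative.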
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();

            conf.set("mapred.jar", "Namecount.jar");

            String[] ioArgs = new String[] { "name", "name_out" };
            String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: Namecount <in> <out>");
                System.exit(2);
            }

            Job job = new Job(conf, "name_goods_count");
            job.setJarByClass(Namecount.class);

            // set the Map, Combine, and Reduce classes
            job.setMapperClass(Map.class);
            job.setCombinerClass(Reduce.class);
            job.setReducerClass(Reduce.class);

            // set the output types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // split the input data set into splits and provide a RecordReader implementation
            job.setInputFormatClass(TextInputFormat.class);
            // provide a RecordWriter implementation responsible for writing the output
            job.setOutputFormatClass(TextOutputFormat.class);

            // set the input and output directories
            Path in = new Path("hdfs://localhost:9000/mymapreduce3/123/12345.txt");
            Path out = new Path("hdfs://localhost:9000/mymapreduce3/out");
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);

            // submit the job and wait for completion; without this call the job never actually runs
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
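
    A side note on the array out-of-bounds errors mentioned above: parseTraffic and parseStatus index into the result of split() without checking its length, so any malformed or truncated log line will throw. A minimal defensive variant, as a sketch (the "0" fallback is my assumption, not part of the original program):

    // Sketch: guard against malformed log lines before indexing into split() results.
    // Falling back to "0" is an assumption; use whatever sentinel suits the cleaning rules.
    private String parseTrafficSafely(String line) {
        int quote = line.lastIndexOf("\"");
        if (quote < 0) return "0"; // no quoted request section in this line
        String[] fields = line.substring(quote + 1).trim().split(" ");
        return fields.length > 1 ? fields[1] : "0"; // traffic is the second trailing field
    }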

    That leaves a few small errors to fix; tomorrow I should be able to finish the remaining parts and import the results into Hive (a rough sketch of the Hive import step follows).
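    For the Hive step, one common route once the cleaned output sits in HDFS is a LOAD DATA statement issued over JDBC. A minimal sketch, assuming HiveServer2 is running on its default port 10000; the table name and column list are hypothetical and need to match the fields the job writes:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    public class HiveLoad {
        public static void main(String[] args) throws Exception {
            // Assumes the Hive JDBC driver is on the classpath and HiveServer2 listens on 10000.
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            try (Connection con = DriverManager.getConnection(
                    "jdbc:hive2://localhost:10000/default", "", "");
                 Statement stmt = con.createStatement()) {
                // Hypothetical table; columns follow the tab-separated output written by the job.
                stmt.execute("CREATE TABLE IF NOT EXISTS access_log (ip STRING, day STRING, "
                        + "f1 STRING, f2 STRING, f3 STRING, f4 STRING, flag STRING, cnt INT) "
                        + "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'");
                // Moves the MapReduce output files into the table's warehouse directory.
                stmt.execute("LOAD DATA INPATH 'hdfs://localhost:9000/mymapreduce3/out' "
                        + "INTO TABLE access_log");
            }
        }
    }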
