  • Word Count - MapReduce Job

    The pom.xml file:

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    	<modelVersion>4.0.0</modelVersion>
    
    	<groupId>com.zuoyan</groupId>
    	<artifactId>hadoop</artifactId>
    	<version>0.0.1-SNAPSHOT</version>
    	<packaging>jar</packaging>
    
    	<name>hadoop</name>
    	<url>http://maven.apache.org</url>
    
    	<properties>
    		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    	</properties>
    
    	<dependencies>
    		<dependency>
    			<groupId>junit</groupId>
    			<artifactId>junit</artifactId>
    			<version>3.8.1</version>
    		</dependency>
    		<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    		<dependency>
    			<groupId>org.apache.hadoop</groupId>
    			<artifactId>hadoop-client</artifactId>
    			<version>3.0.0</version>
    		</dependency>
    		<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
    		<dependency>
    		    <groupId>com.janeluo</groupId>
    		    <artifactId>ikanalyzer</artifactId>
    		    <version>2012_u6</version>
    		</dependency>
    	</dependencies>
    	<build>
    		<plugins>
    			<plugin>
    				<artifactId>maven-assembly-plugin</artifactId>
    				<version>3.1.0</version>
    				<configuration>
    					<appendAssemblyId>false</appendAssemblyId>
    					<descriptorRefs>
    						<descriptorRef>jar-with-dependencies</descriptorRef>
    					</descriptorRefs>
    					<archive>
    						<manifest>
    						<!-- Specify the class containing the main-method entry point here -->
    						<mainClass>com.zuoyan.hadoop.FirstMapReduceJob</mainClass>
    						</manifest>
    					</archive>
    				</configuration>
    				<executions>
    					<execution>
    						<id>make-assembly</id>
    						<phase>package</phase>
    						<goals>
    							<!-- the 'assembly' goal is deprecated; 'single' is its replacement -->
    							<goal>single</goal>
    						</goals>
    					</execution>
    				</executions>
    			</plugin>
    			<plugin>
    				<groupId>org.apache.maven.plugins</groupId>
    				<artifactId>maven-compiler-plugin</artifactId>
    				<version>3.6.2</version>
    				<configuration>
    					<source>1.8</source>
    					<target>1.8</target>
    					<encoding>UTF-8</encoding>
    				</configuration>
    			</plugin>
    		</plugins>
    	</build>
    </project>
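
    With appendAssemblyId set to false and the main class recorded in the jar manifest, packaging should yield a single runnable fat jar named after the artifactId and version. A typical build-and-submit sequence (the HDFS paths below are placeholders) would be:

    mvn clean package
    hadoop jar target/hadoop-0.0.1-SNAPSHOT.jar /user/hadoop/input /user/hadoop/output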
    

      

    Word Count - Implementation

    package com.zuoyan.hadoop;
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;
     
    /**
     * Word count: tokenizes each input line with the IK analyzer
     * and counts the occurrences of each token.
     */
    public class FirstMapReduceJob {
     
        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{
     
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();
     
            @Override
            public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                /*
                 * Default English tokenization:
                 *
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
                */
                /*
                 * Chinese word segmentation using the IK analyzer.
                 * Text.getBytes() exposes the internal buffer, which can contain
                 * stale bytes beyond getLength(), so copy only the valid bytes.
                 */
                byte[] bytes = value.copyBytes();
                InputStream inputStream = new ByteArrayInputStream(bytes);
                Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
                IKSegmenter ikSegmenter = new IKSegmenter(reader, true); // true = smart (coarse-grained) mode
                Lexeme t;
                while ((t = ikSegmenter.next()) != null) {
                    word.set(t.getLexemeText());
                    context.write(word, one);
                }

                // Option 2: inspect the input split for source-file information
    //            context.getInputSplit().getLocationInfo();
            }
        }
     
        public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
            private IntWritable result = new IntWritable();
     
            @Override
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }
     
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }
            Job job = Job.getInstance(conf, "word count"); // the Job constructor is deprecated in Hadoop 2+
            job.setJarByClass(FirstMapReduceJob.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
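
    Using IntSumReducer as both the combiner and the reducer is safe here because per-word counts are summed, and addition is associative and commutative, so partial sums computed on the map side do not change the final result.

    Before submitting to a cluster, the segmentation itself can be sanity-checked locally. Below is a minimal standalone sketch (the class name and sample sentence are made up for illustration) that exercises the same IKSegmenter API the mapper uses:

    package com.zuoyan.hadoop;

    import java.io.StringReader;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    /**
     * Prints the tokens IK produces for a sample sentence so the
     * mapper's segmentation can be verified without a cluster.
     */
    public class IkSegmenterDemo {
        public static void main(String[] args) throws Exception {
            // true enables smart (coarse-grained) segmentation, matching the mapper above
            IKSegmenter segmenter = new IKSegmenter(new StringReader("我们都是中国人"), true);
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                System.out.println(lexeme.getLexemeText());
            }
        }
    }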
    

      

  • Original article: https://www.cnblogs.com/guoziyi/p/10282470.html