zoukankan html css js c++ java

一个mapreduce得到需要计算单词概率的基础数据

第一步，先计算需要计算概率的词频，单词种类数，类别单词总数（类别均是按照文件夹名区分）（基础数据已经分词，每个单词一行，并且已经预处理好）

package org.lukey.hadoop.classifyBayes;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * First MapReduce pass of a naive-Bayes text classifier: collects the raw counts
 * that the later probability pass needs.
 *
 * <p>The input is a corpus directory whose sub-directories are the classes. Each
 * file below a class directory is one document, already tokenised to one word per
 * line. A single job produces:
 *
 * <ol>
 * <li>the regular job output, one line per (class, word):
 * {@code CLASS <tab> word <tab> count};</li>
 * <li>the named output {@code wordsInClass}, one line per class:
 * {@code CLASS <tab> totalWordsInClass};</li>
 * <li>the counter {@link WordsNature#TOTALWORDS} (number of distinct words over all
 * classes), which the driver writes to {@code /user/hadoop/output/totalwords.txt}
 * as {@code <counter display name>:<value>};</li>
 * <li>a "prior" file (path taken from the {@code priorProbality} configuration key)
 * holding {@code CLASS <tab> documentCount} lines followed by
 * {@code CLASS:priorProbability} lines.</li>
 * </ol>
 */

public class MyWordCount {

    static String baseOutputPath = "/user/hadoop/test_out";

    // Documents seen per class (class name -> file names). Deliberately static: the
    // counts are accumulated across every mapper instance that runs in this JVM.
    private static final Map<String, List<String>> fileCountMap = new HashMap<String, List<String>>();
    // Number of documents per class, derived from fileCountMap in the mapper's cleanup.
    private static final Map<String, Integer> fileCount = new HashMap<String, Integer>();

    // Only TOTALWORDS is used in this class. The spelling CLSASS_NUMBER is kept as-is
    // because renaming an enum constant would change the counter name.
    static enum WordsNature {
        CLSASS_NUMBER, CLASS_WORDS, TOTALWORDS
    }

    /**
     * Driver: configures and runs the counting job, then persists the distinct-word
     * counter.
     *
     * @param args ignored; input and output locations are hard-coded
     * @throws Exception if the file system or the job fails
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        // Location of the prior-probability file written by the mapper's cleanup.
        String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbality.txt";
        conf.set("priorProbality", priorProbality);

        String[] otherArgs = { "/user/hadoop/input/NBCorpus/Country", "/user/hadoop/mid/wordsFre" };

        Job job = new Job(conf, "file count");
        job.setJarByClass(MyWordCount.class);
        job.setMapperClass(First_Mapper.class);
        job.setReducerClass(First_Reducer.class);

        // Only class directories with more than 10 entries take part in training.
        List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
        if (inputPaths.isEmpty()) {
            System.err.println("No class directory with more than 10 entries found under " + otherArgs[0]);
            System.exit(1);
        }
        for (Path path : inputPaths) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        boolean succeeded = job.waitForCompletion(true);

        // Persist the counter only for a successful run; a failed job would otherwise
        // leave a misleading totalwords.txt behind for the next pass.
        if (succeeded) {
            Counters counters = job.getCounters();
            Counter c1 = counters.findCounter(WordsNature.TOTALWORDS);
            System.out.println("-------------->>>>: " + c1.getDisplayName() + ":" + c1.getName() + ": " + c1.getValue());

            Path totalWordsPath = new Path("/user/hadoop/output/totalwords.txt");
            FileSystem fs = FileSystem.get(conf);
            FSDataOutputStream outputStream = fs.create(totalWordsPath);
            try {
                outputStream.writeBytes(c1.getDisplayName() + ":" + c1.getValue());
            } finally {
                IOUtils.closeStream(outputStream);
            }
        }

        System.exit(succeeded ? 0 : 1);
    }

    /**
     * Emits three records per input word and remembers which documents were seen so
     * that {@link #cleanup} can write document counts and class priors.
     */
    static class First_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final static IntWritable zero = new IntWritable(0);

        // Reused output keys: "class<tab>word" and "class".
        private final Text className = new Text();
        private final Text classKey = new Text();

        // The input split does not change while a task runs, so the document only
        // has to be registered once instead of being looked up for every record.
        private boolean documentRegistered = false;

        /**
         * For the word in {@code value} emits (class + tab + word, 1), (class, 1)
         * and (word, 0).
         *
         * <p>The reducer tells the three kinds apart by their summed value and by the
         * number of tab-separated fields in the key.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) context.getInputSplit();

            // File name identifies the document, parent directory name is the class.
            String fileName = fileSplit.getPath().getName();
            String dirName = fileSplit.getPath().getParent().getName();

            if (!documentRegistered) {
                List<String> files = fileCountMap.get(dirName);
                if (files == null) {
                    files = new ArrayList<String>();
                    fileCountMap.put(dirName, files);
                }
                if (!files.contains(fileName)) {
                    files.add(fileName);
                }
                documentRegistered = true;
            }

            className.set(dirName + "\t" + value.toString());
            classKey.set(dirName);

            context.write(className, one); // occurrences of this word in this class
            context.write(classKey, one);  // total number of words in this class
            context.write(value, zero);    // marks the word as existing (distinct-word count)
        }

        /**
         * Writes document counts and class priors to the file named by the
         * {@code priorProbality} configuration key, one record per line.
         *
         * <p>NOTE(review): the counts come from static state in the current JVM, and
         * every map task that reaches cleanup re-creates the same file. The result is
         * therefore only complete when all map tasks share one JVM (presumably local
         * mode) - confirm before relying on this file on a cluster.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            String file = conf.get("priorProbality");
            FileSystem fs = FileSystem.get(URI.create(file), conf);
            FSDataOutputStream priorStream = fs.create(new Path(file));
            try {
                // Document count per class.
                for (Map.Entry<String, List<String>> entry : fileCountMap.entrySet()) {
                    int documents = entry.getValue().size();
                    fileCount.put(entry.getKey(), documents);
                    writeLine(priorStream, entry.getKey() + "\t" + documents);
                }

                // Total number of documents.
                int fileSum = 0;
                for (Integer num : fileCount.values()) {
                    fileSum += num;
                }
                System.out.println("fileSum = " + fileSum);

                // Prior probability of each class.
                for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
                    double p = (double) entry.getValue() / fileSum;
                    writeLine(priorStream, entry.getKey() + ":" + p);
                }
            } finally {
                IOUtils.closeStream(priorStream);
            }
        }

        // Writes one newline-terminated record as UTF-8 so that records stay
        // separable and non-ASCII class names are not truncated to their low bytes.
        private static void writeLine(FSDataOutputStream out, String line) throws IOException {
            out.write((line + "\n").getBytes("UTF-8"));
        }
    }

    /**
     * Sums the values of each key and routes the result by key kind: a sum of zero
     * increments the distinct-word counter, a two-field key goes to the regular
     * output, and any other key goes to the {@code wordsInClass} named output.
     */
    static class First_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value.
        private final IntWritable result = new IntWritable();
        // Owned by this reducer instance: created in setup, closed in cleanup.
        private MultipleOutputs<Text, IntWritable> mos;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }

            if (sum == 0) {
                // Only the (word, 0) markers sum to zero: one more distinct word.
                context.getCounter(WordsNature.TOTALWORDS).increment(1);
                return;
            }

            result.set(sum);
            String[] fields = key.toString().split("\t");
            if (fields.length == 2) {
                // "class<tab>word": frequency of the word inside the class.
                context.write(key, result);
            } else {
                // "class": total number of words in the class.
                mos.write(key, result, "wordsInClass");
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
    }

    /**
     * Lists the direct sub-directories of {@code folder} that contain more than 10
     * entries; these are the class directories used as job input.
     *
     * @param conf   configuration used to obtain the file system
     * @param folder corpus root directory
     * @return the qualifying sub-directory paths, possibly empty
     * @throws Exception if the file system cannot be accessed
     */
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : fs.listStatus(new Path(folder))) {
            if (!stat.isDir()) {
                continue;
            }
            if (fs.listStatus(stat.getPath()).length > 10) {
                folderPath.add(stat.getPath());
            }
        }
        return folderPath;
    }

}

View Code

第二步，计算每个类别单词的概率，需提前读取每个类别单词总数，以及总的单词种类数（都可以通过configuration.set传递），也可以在setup里面先于map处理前读取数据。

package org.lukey.hadoop.classifyBayes;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * Second MapReduce pass of the naive-Bayes classifier: turns the per-class word
 * frequencies into smoothed conditional probabilities.
 *
 * <p>For every input line {@code CLASS <tab> word <tab> count} the job emits
 * {@code CLASS <tab> word <tab> p} with
 * {@code p = (count + 1) / (wordsInClass + TOTALWORDS)}. {@code wordsInClass} is read
 * from the file named by the {@code wordsInClassPath} configuration key and
 * {@code TOTALWORDS} from the configuration, which the driver fills from
 * {@code totalwords.txt}.
 */
public class Probability {

    private static final Log LOG = LogFactory.getLog(Probability.class);

    /** Number of distinct words; set by {@link #main} in the driver JVM only. */
    public static int total = 0;

    /**
     * Driver: loads the distinct-word count into the configuration and runs the job.
     *
     * @param args ignored; all locations are hard-coded
     * @throws Exception if the count file is missing or malformed, or the job fails
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "192.168.190.128:9001");
        conf.set("mapred.jar", "probability.jar");

        // NOTE(review): the counting job shown earlier in this source writes to
        // /user/hadoop/mid/wordsFre, while the paths below use mid/wordsFrequence -
        // confirm which directory name is the intended one.
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/wordsInClass-r-00000";

        conf.set("wordsInClassPath", wordsInClassPath);

        // The file holds a single "NAME:value" line; it is copied into the
        // configuration so that the map tasks can read it.
        FileSystem fs = FileSystem.get(URI.create(totalWordsPath), conf);
        String strLine;
        BufferedReader buffer = new BufferedReader(
                new InputStreamReader(fs.open(new Path(totalWordsPath)), "UTF-8"));
        try {
            strLine = buffer.readLine();
        } finally {
            buffer.close();
        }
        if (strLine == null) {
            throw new IOException(totalWordsPath + " is empty; expected a line like TOTALWORDS:<n>");
        }
        String[] temp = strLine.split(":");
        if (temp.length == 2) {
            conf.set(temp[0], temp[1].trim());
        }

        String totalWords = conf.get("TOTALWORDS");
        if (totalWords == null) {
            throw new IOException("No TOTALWORDS entry found in " + totalWordsPath + ": " + strLine);
        }
        total = Integer.parseInt(totalWords);
        LOG.info("------>total = " + total);

        System.out.println("total ==== " + total);

        Job job = new Job(conf, "file count");

        job.setJarByClass(Probability.class);

        job.setMapperClass(WordsOfClassCountMapper.class);
        job.setReducerClass(WordsOfClassCountReducer.class);

        String input = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        String output = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

    /**
     * Computes the smoothed probability of each (class, word) pair. Lines that do not
     * have exactly three tab-separated fields are ignored.
     */
    static class WordsOfClassCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        // Reused output objects.
        private final DoubleWritable number = new DoubleWritable();
        private final Text className = new Text();

        // Total number of words per class, loaded once in setup.
        private final Map<String, Integer> filemap = new HashMap<String, Integer>();
        // Number of distinct words over all classes, parsed once in setup.
        private int totalWords;
        private String wordsInClassPath;

        /**
         * Reads {@code TOTALWORDS} from the configuration and loads the
         * {@code CLASS <tab> total} lines of the per-class totals file.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();

            String configuredTotal = conf.get("TOTALWORDS");
            if (configuredTotal == null) {
                throw new IOException("TOTALWORDS is not set in the job configuration");
            }
            totalWords = Integer.parseInt(configuredTotal);

            wordsInClassPath = conf.get("wordsInClassPath");
            FileSystem fs = FileSystem.get(URI.create(wordsInClassPath), conf);
            BufferedReader buffer = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(wordsInClassPath)), "UTF-8"));
            try {
                String strLine;
                while ((strLine = buffer.readLine()) != null) {
                    String[] temp = strLine.split("\t");
                    if (temp.length < 2) {
                        continue; // blank or truncated line
                    }
                    filemap.put(temp[0], Integer.parseInt(temp[1]));
                }
            } finally {
                buffer.close();
            }
        }

        /**
         * Emits (class + tab + word, (count + 1) / (wordsInClass + TOTALWORDS)) for an
         * input line of the form {@code CLASS <tab> word <tab> count}.
         *
         * @throws IOException if the class has no entry in the per-class totals file
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Expected input, e.g.:
            // ALB weekend 1
            // ALB weeks 3
            String[] temp = value.toString().split("\t");
            if (temp.length != 3) {
                return; // e.g. the two-field lines of the wordsInClass file
            }

            Integer classTotal = filemap.get(temp[0]);
            if (classTotal == null) {
                throw new IOException("No word total for class '" + temp[0] + "' in " + wordsInClassPath);
            }
            int allWordsInClass = classTotal.intValue();
            int frequency = Integer.parseInt(temp[2]);

            // Add-one smoothing over the vocabulary size.
            double p = (frequency + 1.0) / (allWordsInClass + totalWords);

            className.set(temp[0] + "\t" + temp[1]);
            number.set(p);
            if (LOG.isDebugEnabled()) {
                LOG.debug("------>p = " + p);
            }

            context.write(className, number);
        }

    }

    /** Adds up the probabilities emitted for a key and writes the sum. */
    static class WordsOfClassCountReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

        // Reused output value.
        private final DoubleWritable result = new DoubleWritable();

        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {

            double sum = 0.0;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            result.set(sum);

            context.write(key, result);
        }

    }

}

View Code

查看全文

相关阅读:
hdu 5387 Clock (模拟)
CodeForces 300B Coach （并查集）
hdu 3342 Legal or Not（拓扑排序）
hdu 3853 LOOPS（概率DP）
hdu 3076 ssworld VS DDD（概率dp）
csu 1120 病毒(LICS 最长公共上升子序列）
csu 1110 RMQ with Shifts （线段树单点更新）
poj 1458 Common Subsequence（最大公共子序列）
poj 2456 Aggressive cows （二分）
HDU 1869 六度分离（floyd）

原文地址：https://www.cnblogs.com/luolizhi/p/4944760.html