zoukankan      html  css  js  c++  java
  • MapReduce 笔记

    以最简单的统计词频为例,我们只需要简单地写两个函数,就可以搭建起一个简单的服务集群

    (1) Map和Reduce 函数

    (2) MapReduceSpecification 对象(貌似有专门针对 C++ 的函数库)

    【1】MapReduce研究探讨体会

    下面的这个链接是对谷歌《MapReduce: Simplified Data Processing on Large Clusters》论文的翻译

    【2】MapReduce超大集群的简单数据处理

    Java环境下对MapReduce的设置

    【3】http://blog.csdn.net/xiaotom5/article/details/8074791

    下面是统计词频的源代码

    1 #include "mapreduce/mapreduce.h"
    2 
    3 // User's map function: emits (word, "1") for each whitespace-delimited word in the input value
    4 class WordCounter : public Mapper {
    5    public:
    6        virtual void Map(const MapInput& input) {
    7            const string& text = input.value();
    8            const int n = text.size();
    9            for (int i = 0; i < n; ) {  // i is advanced only by the two inner scans
    10                // Skip past leading whitespace
    11                while ((i < n) && isspace(text[i]))
    12                    i++;
    13 
    14                // Find word end: [start, i) is the next run of non-space characters
    15                int start = i;
    16                while ((i < n) && !isspace(text[i]))
    17                    i++;
    18                if (start < i)  // empty when only trailing whitespace remained
    19                    Emit(text.substr(start,i-start),"1");
    20            }
    21       }
    22 };
    23 
    24 REGISTER_MAPPER(WordCounter);
    25 
    26 // User's reduce function: totals the values of one key
    27 class Adder : public Reducer {
    28    virtual void Reduce(ReduceInput* input) {
    29        // Walk every entry grouped under the
    30        // same key, accumulating a running total
    31        int64 total = 0;
    32        for (; !input->done(); input->NextValue()) {
    33            total += StringToInt(input->value());
    34        }
    35 
    36        // Emit the accumulated sum
    37        // for input->key()
    38        Emit(IntToString(total));
    39    }
    40 };
    41 
    42 REGISTER_REDUCER(Adder);
    43 
    44 int main(int argc, char** argv) {
    45    ParseCommandLineFlags(argc, argv);
    46    // Build the job specification, run it, and abort if the run fails
    47    MapReduceSpecification spec;
    48    
    49    // Store list of input files into "spec"
    50    for (int i = 1; i < argc; i++) {
    51        MapReduceInput* input = spec.add_input();
    52        input->set_format("text");
    53        input->set_filepattern(argv[i]);  // one input entry per argument
    54        input->set_mapper_class("WordCounter");
    55    }
    56 
    57    // Specify the output files:
    58    // /gfs/test/freq-00000-of-00100
    59    // /gfs/test/freq-00001-of-00100
    60    // ...
    61    MapReduceOutput* out = spec.output();
    62    out->set_filebase("/gfs/test/freq");
    63    out->set_num_tasks(100);
    64    out->set_format("text");
    65    out->set_reducer_class("Adder");
    66    
    67    // Optional: do partial sums within map
    68    // tasks to save network bandwidth
    69    out->set_combiner_class("Adder");
    70 
    71    // Tuning parameters: use at most 2000
    72    // machines and 100 MB of memory per task
    73    spec.set_machines(2000);
    74    spec.set_map_megabytes(100);
    75    spec.set_reduce_megabytes(100);
    76    
    77    // Now run it
    78    MapReduceResult result;
    79    if (!MapReduce(spec, &result)) abort();
    80    
    81    // Done: 'result' structure contains info
    82    // about counters, time taken, number of
    83    // machines used, etc.
    84    return 0;
    85 }
    86
    
  • 相关阅读:
    qt install (1)
    learning rewind func
    learning strrchr func
    learning memchr func
    git lfs setpu(4)
    大端与小端
    git branch/meger step(3)
    git log/show/HEAD step(2)
    调用外部EXE文件
    获取计算机上的所有进程
  • 原文地址:https://www.cnblogs.com/CBDoctor/p/2932144.html
Copyright © 2011-2022 走看看