zoukankan      html  css  js  c++  java
  • mongodb mapreduce小试

    最近由于产品业务的需求,需要进行一些数据量相对较大的计算,顺便试试mongodb的mapreduce功能,感觉还不错

    下面是官方提供的一个例子:

    $ ./mongo
    > db.things.insert( { _id : 1, tags : ['dog', 'cat'] } );
    > db.things.insert( { _id : 2, tags : ['cat'] } );
    > db.things.insert( { _id : 3, tags : ['mouse', 'cat', 'dog'] } );
    > db.things.insert( { _id : 4, tags : []  } );
    
    > // map function: for each document, call emit( tag , { count : 1 } ) once per entry in its tags array
    > m = function(){
    ...    this.tags.forEach(
    ...        function(z){
    ...            emit( z , { count : 1 } );
    ...        }
    ...    );
    ...};
    
    > // reduce function: add up the count fields of all values received for one key and return { count : total }
    > r = function( key , values ){
    ...    var total = 0;
    ...    for ( var i=0; i<values.length; i++ )
    ...        total += values[i].count;
    ...    return { count : total };
    ...};
    
    > res = db.things.mapReduce(m,r);
    > res
    {"timeMillis.emit" : 9 , "result" : "mr.things.1254430454.3" ,
     "numObjects" : 4 , "timeMillis" : 9 , "errmsg" : "" , "ok" : 0}
    
    > db[res.result].find()
    {"_id" : "cat" , "value" : {"count" : 3}}
    {"_id" : "dog" , "value" : {"count" : 2}}
    {"_id" : "mouse" , "value" : {"count" : 1}} 
    
    > db[res.result].drop()

    mapreduce参数说明

    db.runCommand(
    { 
        mapreduce : <collection>,  
        map : <mapfunction>,    
        reduce : <reducefunction>  
        [, query : <query filter object>]    
        [, sort : <sort the query.  useful for optimization>]    
        [, limit : <number of objects to return from collection>]    
        [, out : <output-collection name>]    
        [, keeptemp: <true|false>]    
        [, finalize : <finalizefunction>]    
        [, scope : <object where fields go into javascript global scope >]    
        [, verbose : true]  
    });

        mapreduce:指定要进行mapreduce处理的collection
        map:map函数
        reduce:reduce函数
        query:一个筛选条件,只有满足条件的行才会加入mapreduce集合,而这个筛选过程是先于整个mapreduce流程而执行的
        sort:和query结合的sort排序参数,这是唯一可以优化分组机制的地方
        limit:和query、sort配合使用,限制参与mapreduce处理的文档数量
        out:结果输出的collection的名字,不指定会默认创建一个随机名字的collection
        keeptemp:true或false,表明结果输出到的collection是否是临时的,如果为true,则会在客户端连接中断后自动删除,如果你用的是MongoDB的mongo客户端连接,那必须exit后才会删除。如果是脚本执行,脚本退出或调用close会自动删除结果collection
        finalize:和map,reduce一样是一个函数,它可以在reduce得出一个结果后再对key和value进行一次计算并返回一个最终结果
        scope:设置参数值,在这里设置的值在map,reduce,finalize函数中可见
        verbose:在执行过程中打印调试信息。

    返回格式:

    { 
    result : <collection_name>,   
    counts : {input :  <number of objects scanned>, emit  : <number of times emit was called>, output : <number of items in output collection>} ,
    timeMillis : <job_time>,
    ok : <1_if_ok>,
    [, err : <errmsg_if_error>] 
    }

     下面来一个略微复杂一点的例子,下面是统计房源列表页房源的曝光量:

    mongodb数据格式:

    { "_id" : ObjectId("50364d9fdec7d5ce4000198d"), "pn" : "Listing_V2_IndexPage_All", "guid" : "E200F425-30E7-0D97-9B3A-E047A08CE47C", "uguid" : "4455754C-B2A0-7EDA-6387-A50F0228DE7F", "url" : "http://shanghai.haozu.com/listing/pudong/?from=in_area", "referer" : "http://shanghai.haozu.com/", "site" : "haozu", "stamp" : "1345691212948", "cip" : "116.231.123.184", "sessid" : "B1197AA0-976C-F6EF-BB6F-9401D8E983DD", "cid" : "11", "cstamp" : "1345691178421", "cstparam" : "{\"found\":\"37695\",\"proids\":\"10290023|10353348|8448223|10310737|10311720|10250125|10320886|8507299|10332158|10341287|10266002|10322302|9185878|10273552|10272872|10282252|10270250|10336122|9350169|10196350|8533446|10250019|10335617|10222489\"}", "rfpn" : "Home_Index8Page", "agent" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; 360SE; 360SE)" }

    房源id保存在cstparam字段里面,是一个字符串,因此需要先从这个字符串中把房源id解析出来,然后再进行统计,对应的map,reduce的写法为:

    map方法:

    /**
     * Map step: emits (listingId, 1) for every listing id stored in this
     * document's `cstparam` string.
     *
     * `cstparam` is a JSON-formatted string such as
     * {"found":"37695","proids":"10290023|10353348"}; the ids are the
     * "|"-separated pieces of the "proids" value.
     *
     * Runs with `this` bound to the current document and relies on the
     * `emit(key, value)` function supplied by mapReduce.
     */
    var m = function () {
        // Look the value up by its key name instead of by position, so the
        // result does not depend on "proids" being the last key in the string.
        var match = /"proids"\s*:\s*"([^"]*)"/.exec(this.cstparam);
        if (!match) {
            return;
        }
        var ids = match[1].split("|");
        // Indexed loop: for...in would also visit any enumerable property
        // inherited from Array.prototype.
        for (var i = 0; i < ids.length; i++) {
            // An empty "proids" value splits into [""]; do not count it as an id.
            if (ids[i] !== "") {
                emit(ids[i], 1);
            }
        }
    };

    reduce方法:

    /**
     * Reduce step: adds up all numeric values emitted for one key.
     *
     * @param {string} key   the listing id the values were emitted under
     * @param {number[]} emits  values for that key (1 from the map step, or
     *                          partial sums returned by earlier reduce calls)
     * @returns {number} the total for this key
     */
    var reduce = function (key, emits) {
        var count = 0;
        // Indexed loop: for...in would also add any enumerable property
        // inherited from Array.prototype to the sum.
        for (var i = 0; i < emits.length; i++) {
            count += emits[i];
        }
        return count;
    };

     执行:

    // Pass the map function under the name it is declared with above (m).
    // The query originally listed 'cstparam' twice; in a JavaScript object literal
    // the later duplicate key overwrites the earlier one, so the $exists clause was
    // never part of the query. The regex condition alone already requires the field
    // to be present, so it is kept as the single condition.
    db.log_soj.mapReduce(m,reduce,{out:'result_tmp',query:{'cstparam':/proids/}});

    返回结果:

    {
        "result" : "result_tmp",
        "timeMillis" : 18888,
        "counts" : {
            "input" : 15742,
            "emit" : 333011,
            "reduce" : 103137,
            "output" : 150897
        },
        "ok" : 1,
    }

     结果集:

    { "_id" : "10000003", "value" : 1 }
    { "_id" : "10000016", "value" : 2 }
    { "_id" : "10000032", "value" : 1 }
    { "_id" : "10000039", "value" : 1 }
    { "_id" : "10000043", "value" : 1 }
    { "_id" : "10000059", "value" : 1 }

    再来一个,和上例类似,但是按照房源所出现的城市进行曝光量的统计

    map函数:

    /**
     * Map step: for every listing id in this document's `cstparam` string,
     * emits one record keyed by "<listingId>_<cid>" so exposures are counted
     * per listing and city.
     *
     * `cstparam` is a JSON-formatted string whose "proids" value holds the
     * ids separated by "|". If the document has no `cid`, the key ends in
     * "_undefined".
     *
     * Runs with `this` bound to the current document and relies on the
     * `emit(key, value)` function supplied by mapReduce. Assigned to `m1`
     * because that is the name the mapReduce call uses.
     */
    var m1 = function () {
        // Look the value up by its key name instead of by position, so the
        // result does not depend on "proids" being the last key in the string.
        var match = /"proids"\s*:\s*"([^"]*)"/.exec(this.cstparam);
        if (!match) {
            return;
        }
        var ids = match[1].split("|");
        // Indexed loop: for...in would also visit any enumerable property
        // inherited from Array.prototype.
        for (var i = 0; i < ids.length; i++) {
            // An empty "proids" value splits into [""]; do not count it as an id.
            if (ids[i] === "") {
                continue;
            }
            var key = ids[i] + "_" + this.cid;
            emit(key, {prop_id:ids[i], city_id:this.cid, count:1});
        }
    };

    reduce函数:

    /**
     * Reduce step: merges all records emitted for one "<listingId>_<cid>" key
     * into a single record whose count is the sum of the individual counts.
     *
     * Assigned to `r1` because that is the name the mapReduce call uses.
     *
     * @param {string} key  the "<listingId>_<cid>" key
     * @param {Array<{prop_id: string, city_id: string, count: number}>} emits
     *        records for that key (from the map step or from earlier reduce calls)
     * @returns {{prop_id: string, city_id: string, count: number}} merged record
     */
    var r1 = function (key, emits) {
        var total = 0;
        // Indexed loop: for...in would also visit any enumerable property
        // inherited from Array.prototype.
        for (var i = 0; i < emits.length; i++) {
            total += emits[i].count;
        }
        // Every record under one key carries the same prop_id/city_id, so the
        // first one is used for the merged record.
        return {prop_id:emits[0].prop_id, city_id:emits[0].city_id, count:total};
    };

    执行:

    // The query originally listed 'cstparam' twice; in a JavaScript object literal
    // the later duplicate key overwrites the earlier one, so the $exists clause was
    // never part of the query. The regex condition alone already requires the field
    // to be present, so it is kept as the single condition.
    db.log_soj.mapReduce(m1,r1,{out:'result_tmp',query:{'cstparam':/proids/}});

    结果:

    { "_id" : "10000003_undefined", "value" : { "prop_id" : "10000003", "city_id" : null, "count" : 1 } }
    { "_id" : "10000016_14", "value" : { "prop_id" : "10000016", "city_id" : "14", "count" : 2 } }
    { "_id" : "10000032_15", "value" : { "prop_id" : "10000032", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000039_15", "value" : { "prop_id" : "10000039", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000043_11", "value" : { "prop_id" : "10000043", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000059_17", "value" : { "prop_id" : "10000059", "city_id" : "17", "count" : 1 } }
    { "_id" : "10000068_11", "value" : { "prop_id" : "10000068", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000099_15", "value" : { "prop_id" : "10000099", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000100_18", "value" : { "prop_id" : "10000100", "city_id" : "18", "count" : 1 } }
    { "_id" : "10000106_14", "value" : { "prop_id" : "10000106", "city_id" : "14", "count" : 1 } }
    { "_id" : "10000109_18", "value" : { "prop_id" : "10000109", "city_id" : "18", "count" : 3 } }
    { "_id" : "10000112_15", "value" : { "prop_id" : "10000112", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000118_15", "value" : { "prop_id" : "10000118", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000156_11", "value" : { "prop_id" : "10000156", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000224_14", "value" : { "prop_id" : "10000224", "city_id" : "14", "count" : 1 } }
    { "_id" : "10000250_22", "value" : { "prop_id" : "10000250", "city_id" : "22", "count" : 1 } }
    { "_id" : "10000262_25", "value" : { "prop_id" : "10000262", "city_id" : "25", "count" : 1 } }
    { "_id" : "10000267_14", "value" : { "prop_id" : "10000267", "city_id" : "14", "count" : 3 } }
    { "_id" : "10000305_14", "value" : { "prop_id" : "10000305", "city_id" : "14", "count" : 3 } }
    { "_id" : "10000323_11", "value" : { "prop_id" : "10000323", "city_id" : "11", "count" : 1 } }

    转载请注明出处:

    http://www.cnblogs.com/xiazh/archive/2012/09/05/2671730.html

  • 相关阅读:
    OpenCV 2.4.9
    开机黑屏 仅仅显示鼠标 电脑黑屏 仅仅有鼠标 移动 [已成功解决]
    吐槽一下CSDN的封停审查机制
    【课程分享】Oracle数据库系统project师
    Html的空格显示
    iOS UIWebView 访问https 绕过证书验证的方法
    Java实现 蓝桥杯VIP 算法训练 整除问题
    Java实现 蓝桥杯VIP 算法训练 数位分离
    Java实现 蓝桥杯VIP 算法训练 薪水计算
    Java实现 蓝桥杯VIP 算法训练 完数
  • 原文地址:https://www.cnblogs.com/xiazh/p/2671730.html
Copyright © 2011-2022 走看看