zoukankan      html  css  js  c++  java
  • mongodb mapreduce小试

    最近由于产品业务的需求,需要对一些数据量相对较大的数据进行计算,顺便试了试mongodb的mapreduce功能,感觉还不错

    下面是官方提供的一个例子:

    $ ./mongo
    > db.things.insert( { _id : 1, tags : ['dog', 'cat'] } );
    > db.things.insert( { _id : 2, tags : ['cat'] } );
    > db.things.insert( { _id : 3, tags : ['mouse', 'cat', 'dog'] } );
    > db.things.insert( { _id : 4, tags : []  } );
    
    > // map function
    > m = function(){
    ...    this.tags.forEach(
    ...        function(z){
    ...            emit( z , { count : 1 } );
    ...        }
    ...    );
    ...};
    
    > // reduce function
    > r = function( key , values ){
    ...    var total = 0;
    ...    for ( var i=0; i<values.length; i++ )
    ...        total += values[i].count;
    ...    return { count : total };
    ...};
    
    > res = db.things.mapReduce(m,r);
    > res
    {"timeMillis.emit" : 9 , "result" : "mr.things.1254430454.3" ,
     "numObjects" : 4 , "timeMillis" : 9 , "errmsg" : "" , "ok" : 0}
    
    > db[res.result].find()
    {"_id" : "cat" , "value" : {"count" : 3}}
    {"_id" : "dog" , "value" : {"count" : 2}}
    {"_id" : "mouse" , "value" : {"count" : 1}} 
    
    > db[res.result].drop()

    mapreduce参数说明

    db.runCommand(
    { 
        mapreduce : <collection>,  
        map : <mapfunction>,    
        reduce : <reducefunction>  
        [, query : <query filter object>]    
        [, sort : <sort the query.  useful for optimization>]    
        [, limit : <number of objects to return from collection>]    
        [, out : <output-collection name>]    
        [, keeptemp: <true|false>]    
        [, finalize : <finalizefunction>]    
        [, scope : <object where fields go into javascript global scope >]    
        [, verbose : true]  
    });

        mapreduce:指定要进行mapreduce处理的collection
        map:map函数
        reduce:reduce函数
        query:一个筛选条件,只有满足条件的行才会加入mapreduce集合,而这个筛选过程是先于整个mapreduce流程而执行的
        sort:和query结合的sort排序参数,这是唯一可以优化分组机制的地方
        limit:和query、sort结合使用,限制参与mapreduce处理的文档数量
        out:结果输出的collection的名字,不指定会默认创建一个随机名字的collection
        keeptemp:true或false,表明结果输出到的collection是否是临时的,如果为true,则会在客户端连接中断后自动删除,如果你用的是MongoDB的mongo客户端连接,那必须exit后才会删除。如果是脚本执行,脚本退出或调用close会自动删除结果collection
        finalize:和map,reduce一样是一个函数,它可以在reduce得出一个结果后再对key和value进行一次计算并返回一个最终结果
        scope:设置参数值,在这里设置的值在map,reduce,finalize函数中可见
        verbose:在执行过程中打印调试信息。

    返回格式:

    { 
    result : <collection_name>,   
    counts : {input :  <number of objects scanned>, emit  : <number of times emit was called>, output : <number of items in output collection>} ,
    timeMillis : <job_time>,
    ok : <1_if_ok>,
    [, err : <errmsg_if_error>] 
    }

     下面来一个略微复杂一点的例子,下面是统计房源列表页房源的曝光量:

    mongodb数据格式:

    { "_id" : ObjectId("50364d9fdec7d5ce4000198d"), "pn" : "Listing_V2_IndexPage_All", "guid" : "E200F425-30E7-0D97-9B3A-E047A08CE47C", "uguid" : "4455754C-B2A0-7EDA-6387-A50F0228DE7F", "url" : "http://shanghai.haozu.com/listing/pudong/?from=in_area", "referer" : "http://shanghai.haozu.com/", "site" : "haozu", "stamp" : "1345691212948", "cip" : "116.231.123.184", "sessid" : "B1197AA0-976C-F6EF-BB6F-9401D8E983DD", "cid" : "11", "cstamp" : "1345691178421", "cstparam" : "{\"found\":\"37695\",\"proids\":\"10290023|10353348|8448223|10310737|10311720|10250125|10320886|8507299|10332158|10341287|10266002|10322302|9185878|10273552|10272872|10282252|10270250|10336122|9350169|10196350|8533446|10250019|10335617|10222489\"}", "rfpn" : "Home_Index8Page", "agent" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; 360SE; 360SE)" }

    房源id保存在cstparam字段里面,是一个字符串,因此需要先对该字符串进行解析(正则匹配或字符串切分),取出房源id后再进行统计,对应的map,reduce的写法为:

    map方法:

    /**
     * Map step: emits (listingId, 1) once for every listing id found in the
     * current document's `cstparam` string.
     *
     * `this` is the document being mapped; its `cstparam` field is expected to
     * be a JSON-encoded string whose "proids" value is a "|"-separated list of
     * listing ids. `emit` is the global provided by the mapReduce runtime.
     * Documents without a non-empty "proids" value emit nothing.
     */
    var m = function () {
        // Locate the id list by field name instead of by position in the
        // string, so the result does not depend on "proids" being the last field.
        var found = /"proids"\s*:\s*"([^"]*)"/.exec(this.cstparam);
        if (!found || found[1] === "") {
            return;
        }
        var ids = found[1].split("|");
        // Index loop: for...in on an array also visits enumerable properties
        // added to Array.prototype.
        for (var i = 0; i < ids.length; i++) {
            emit(ids[i], 1);
        }
    };

    reduce方法:

    /**
     * Reduce step: adds up the partial counts collected for one listing id.
     *
     * @param key   the listing id the values were emitted under (not used here)
     * @param emits array of numeric counts (either the 1s emitted by map or
     *              sums returned by earlier reduce passes)
     * @returns the sum of all values in `emits`
     */
    var reduce = function (key, emits) {
        var count = 0;
        // Index loop: for...in on an array would also visit enumerable
        // properties added to Array.prototype and corrupt the sum.
        for (var i = 0; i < emits.length; i++) {
            count += emits[i];
        }
        return count;
    };

     执行:

    // Run the job with the map function `m` and the reduce function `reduce` defined
    // above, writing the output to the `result_tmp` collection. Only documents whose
    // cstparam contains "proids" are processed. The regex alone is kept in the query:
    // a second 'cstparam' key in the same object literal overrides the first, and a
    // regex match already implies that the field exists.
    db.log_soj.mapReduce(m,reduce,{out:'result_tmp',query:{'cstparam':/proids/}});

    返回结果:

    {
        "result" : "result_tmp",
        "timeMillis" : 18888,
        "counts" : {
            "input" : 15742,
            "emit" : 333011,
            "reduce" : 103137,
            "output" : 150897
        },
        "ok" : 1,
    }

     结果集:

    { "_id" : "10000003", "value" : 1 }
    { "_id" : "10000016", "value" : 2 }
    { "_id" : "10000032", "value" : 1 }
    { "_id" : "10000039", "value" : 1 }
    { "_id" : "10000043", "value" : 1 }
    { "_id" : "10000059", "value" : 1 }

    再来一个,和上例类似,但是按照房源所出现的城市进行曝光量的统计

    map函数:

    /**
     * Map step (per-city variant): for every listing id in the current
     * document's `cstparam`, emits a value keyed by "<listingId>_<cityId>".
     *
     * `this` is the document being mapped; `cstparam` is expected to be a
     * JSON-encoded string whose "proids" value is a "|"-separated id list, and
     * `cid` is the city id. When `cid` is missing the key ends in "_undefined".
     * `emit` is the global provided by the mapReduce runtime.
     * Bound to `m1`, the name the mapReduce call passes as the map function.
     */
    var m1 = function () {
        // Locate the id list by field name instead of by position in the
        // string, so the result does not depend on "proids" being the last field.
        var found = /"proids"\s*:\s*"([^"]*)"/.exec(this.cstparam);
        if (!found || found[1] === "") {
            return;
        }
        var ids = found[1].split("|");
        // Index loop: for...in on an array also visits enumerable properties
        // added to Array.prototype.
        for (var i = 0; i < ids.length; i++) {
            var key = ids[i] + "_" + this.cid;
            emit(key, {prop_id: ids[i], city_id: this.cid, count: 1});
        }
    };

    reduce函数:

    /**
     * Reduce step (per-city variant): sums the `count` of every value emitted
     * for one "<listingId>_<cityId>" key.
     *
     * @param key   the composite key the values were emitted under (not used here)
     * @param emits array of {prop_id, city_id, count} objects
     * @returns an object of the same shape as the emitted values, carrying the
     *          prop_id and city_id of the first value and the summed count
     *
     * Bound to `r1`, the name the mapReduce call passes as the reduce function.
     */
    var r1 = function (key, emits) {
        var total = 0;
        // Index loop: for...in on an array would also visit enumerable
        // properties added to Array.prototype.
        for (var i = 0; i < emits.length; i++) {
            total += emits[i].count;
        }
        // All values under one key share prop_id and city_id (both are part of
        // the key), so the first entry is representative.
        return {prop_id: emits[0].prop_id, city_id: emits[0].city_id, count: total};
    };

    执行:

    // Run the per-city job with map function `m1` and reduce function `r1`, writing
    // the output to the `result_tmp` collection. Only documents whose cstparam
    // contains "proids" are processed. The regex alone is kept in the query: a second
    // 'cstparam' key in the same object literal overrides the first, and a regex
    // match already implies that the field exists.
    db.log_soj.mapReduce(m1,r1,{out:'result_tmp',query:{'cstparam':/proids/}});

    结果:

    { "_id" : "10000003_undefined", "value" : { "prop_id" : "10000003", "city_id" : null, "count" : 1 } }
    { "_id" : "10000016_14", "value" : { "prop_id" : "10000016", "city_id" : "14", "count" : 2 } }
    { "_id" : "10000032_15", "value" : { "prop_id" : "10000032", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000039_15", "value" : { "prop_id" : "10000039", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000043_11", "value" : { "prop_id" : "10000043", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000059_17", "value" : { "prop_id" : "10000059", "city_id" : "17", "count" : 1 } }
    { "_id" : "10000068_11", "value" : { "prop_id" : "10000068", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000099_15", "value" : { "prop_id" : "10000099", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000100_18", "value" : { "prop_id" : "10000100", "city_id" : "18", "count" : 1 } }
    { "_id" : "10000106_14", "value" : { "prop_id" : "10000106", "city_id" : "14", "count" : 1 } }
    { "_id" : "10000109_18", "value" : { "prop_id" : "10000109", "city_id" : "18", "count" : 3 } }
    { "_id" : "10000112_15", "value" : { "prop_id" : "10000112", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000118_15", "value" : { "prop_id" : "10000118", "city_id" : "15", "count" : 1 } }
    { "_id" : "10000156_11", "value" : { "prop_id" : "10000156", "city_id" : "11", "count" : 1 } }
    { "_id" : "10000224_14", "value" : { "prop_id" : "10000224", "city_id" : "14", "count" : 1 } }
    { "_id" : "10000250_22", "value" : { "prop_id" : "10000250", "city_id" : "22", "count" : 1 } }
    { "_id" : "10000262_25", "value" : { "prop_id" : "10000262", "city_id" : "25", "count" : 1 } }
    { "_id" : "10000267_14", "value" : { "prop_id" : "10000267", "city_id" : "14", "count" : 3 } }
    { "_id" : "10000305_14", "value" : { "prop_id" : "10000305", "city_id" : "14", "count" : 3 } }
    { "_id" : "10000323_11", "value" : { "prop_id" : "10000323", "city_id" : "11", "count" : 1 } }

    转载请注明出处:

    http://www.cnblogs.com/xiazh/archive/2012/09/05/2671730.html

  • 相关阅读:
    jquery toggle(listenerOdd, listenerEven)
    struts quick start
    hdu 1518 Square (dfs)
    hdu 2544 最短路 (最短路径)
    hdu 1754 I Hate It (线段树)
    hdu 1856 More is better (并查集)
    hdu 1358 Period (KMP)
    hdu 2616 Kill the monster (DFS)
    hdu 2579 Dating with girls(2) (bfs)
    zoj 2110 Tempter of the Bone (dfs)
  • 原文地址:https://www.cnblogs.com/xiazh/p/2671730.html
Copyright © 2011-2022 走看看