
I see plenty of questions on SO about aggregation in MongoDB; however, I have not found a complete solution to mine yet.

Here's an example of my data:

{
    "fruits" : {
        "apple" : "red",
        "orange" : "orange",
        "plum" : "purple"
    }
}
{
    "fruits" : {
        "apple" : "green",
        "plum" : "purple"
    }
}
{
    "fruits" : {
        "apple" : "red",
        "orange" : "yellow",
        "plum" : "purple"
    }
}

Now, my goal is to determine the popularity of each color for each fruit, so something like this would be the output collection:

{
    "_id" : "apple",
    "values" : {
        "red" : 2,
        "green" : 1
    }
}
{
    "_id" : "orange",
    "values" : {
        "orange" : 1,
        "yellow" : 1
    }
}
{
    "_id" : "plum",
    "values" : {
        "purple" : 3
    }
}

I have tried various M/R functions, and in the end they either don't work, or they take exponentially long. In the context of the example (fruit), I have about 1,000 different fruits and 100,000 colors over about 10,000,000 total documents. My current working M/R is this:

map = function() {
    if (!this.fruits) return;
    for (var fruit in this.fruits) {
        emit(fruit, {
            val_array: [
                {value: this.fruits[fruit], count: 1}
            ]
        });
    }
};

reduce = function(key, values) {
    var collection = {
        val_array: []
    };
    var found = false;
    values.forEach(function(map_obj) {
        map_obj.val_array.forEach(function(value_obj) {
            found = false;
            // if exists in collection, inc, else add
            collection.val_array.forEach(function(coll_obj) {
                if (coll_obj.value == value_obj.value) {
                    // the collection already has this object, increment it
                    coll_obj.count += value_obj.count;
                    found = true;
                    return;
                }
            });
            if (!found) {
                // the collection doesn't have this obj yet, push it
                collection.val_array.push(value_obj);
            }
        });
    });
    return collection;
};

Now, this does work, and for 100 records it takes just a second or so, but the time increases non-linearly, so 100M records would take a very long time. The problem is that I'm doing a poor-man's sub-aggregation in the reduce function with the collection array, which requires me to iterate over both collection and the values from my map function. Now I just need to figure out how to do this efficiently (even if it requires multiple reductions). Any suggestions are welcome!


EDIT: For lack of a better place to post it, here's my solution.
First, I created a file called mr.js:

map = function() {
    if (!this.fruits) return;
    var skip_fruits = {
        'Watermelon':1,
        'Grapefruit':1,
        'Tomato':1 // yes, a tomato is a fruit
    }
    for (var fruit in this.fruits) {
        if (skip_fruits[fruit]) continue;
        var obj = {};
        obj[this.fruits[fruit]] = 1;
        emit(fruit, obj);
    }
};

reduce = function(key, values) {
    var out_values = {};
    values.forEach(function(v) {
        for(var k in v) { // iterate values
            if (!out_values[k]) {
                out_values[k] = v[k]; // init missing counter
            } else {
                out_values[k] += v[k];
            }
        }
    });
    return out_values;
};

var in_coll = "fruit_repo";
var out_coll = "fruit_agg_so";
var total_docs = db[in_coll].count();
var page_size = 100000;
var pages = Math.floor(total_docs / page_size);
print('Starting incremental MR job with '+pages+' pages');
db[out_coll].drop();
for (var i=0; i<pages; i++) {
    var skip = page_size * i;
    print("Calculating page limits for "+skip+" - "+(skip+page_size-1)+"...");
    var start_date = db[in_coll].find({},{date:1}).sort({date:1}).skip(skip).limit(1)[0].date;
    var end_date = db[in_coll].find({},{date:1}).sort({date:1}).skip(skip+page_size-1).limit(1)[0].date;
    var mr_command = {
        mapreduce: in_coll,
        map: map,
        reduce: reduce,
        out: {reduce: out_coll},
        sort: {date: 1},
        query: {
            date: {
                $gte: start_date,
                $lt: end_date
            }
        },
        limit: (page_size - 1)
    };
    print("Running mapreduce for "+skip+" - "+(skip+page_size-1));
    db[in_coll].runCommand(mr_command);
}

That file iterates over my entire collection, incrementally map/reducing 100k docs at a time (sorted by date, which MUST have an index!) and reducing them into a single output collection. It's used like this: `mongo db_name mr.js`.
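
For reference, here's a minimal sketch of building the date index those paging queries rely on (assuming the source collection is fruit_repo, as in the script; ensureIndex was the shell helper of that era, newer shells use createIndex):

// ascending index on date, required for the sorted skip/limit paging to be efficient
db.fruit_repo.ensureIndex({date: 1});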

Then, after a couple of hours, I've got a collection with all the info. To figure out which fruits have the most colors, I use this from the mongo shell to print out the top 20:

// Show the number of possible values per key
var keys = [];
for (var c = db.fruit_agg_so.find(); c.hasNext();) {
    var obj = c.next();
    if (!obj.value) break;
    var len = 0;
    for (var l in obj.value) { len++; }
    keys.push({key: obj['_id'], value: len});
}
keys.sort(function(a, b){
    if (a.value == b.value) return 0;
    return (a.value > b.value)? -1: 1;
});
for (var i=0; i<20; i++) {
    print(keys[i].key+':'+keys[i].value);
}

The really cool thing about this approach is that since it's incremental, I can work with the output data while the mapreduce is running.
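
For example, a quick spot-check of the partial totals is possible mid-run (note the field is value rather than values, since that's how mapReduce stores reduced output):

// peek at the running totals for a single fruit while the job is still going
db.fruit_agg_so.findOne({_id: "apple"});
// -> { "_id" : "apple", "value" : { "red" : ..., "green" : ... } }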

SteveK

2 Answers


It seems that you don't really need val_array. Why not use a simple hash? Try this:

map = function() {
    if (!this.fruits) return;
    for (var fruit in this.fruits) {
        var obj = {};
        obj[this.fruits[fruit]] = 1; // dynamic key: the color name
        emit(fruit, obj);
    }
};

reduce = function(key, values) {
  var colors = {};

  values.forEach(function(v) {
    for (var k in v) { // iterate colors
      if (!colors[k]) // init missing counter
        colors[k] = 0;

      colors[k] += v[k];
    }
  });

  return colors;
};
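
As a usage sketch (the output collection name fruit_colors_so is just a placeholder; fruit_repo matches the question's EDIT):

// run the job from the mongo shell, writing the reduced results to a new collection
db.fruit_repo.mapReduce(map, reduce, { out: "fruit_colors_so" });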
Sergio Tulentsev
  • Wow, I was really overthinking that one, wasn't I! This does indeed do exactly what I wanted. I tested it with 100, 1,000 and 100,000 records and it's running about 20k/sec for each set (apparently linear at these sizes). I'm running the full 10M records now and I can see that as the batches of mapped data get larger, it's taking considerably longer to reduce them (the `colors` object must be growing): `"secs_running" : 488, "msg": "m/r: (1/3) emit phase 383999/10752083 3%"`. – SteveK May 06 '12 at 15:22
  • Btw, I couldn't use `emit(fruit, {this.fruits[fruit]: 1});` because the key was dynamically generated, so I used this JS hack instead: `var obj = {}; obj[this.fruits[fruit]] = 1; emit(fruit, obj);`. – SteveK May 06 '12 at 15:25
  • I'd suggest trying partial jobs then. That is, process documents in batches of 100k (or whatever) and then reduce it in one final job. This can be tricky to implement, so if it's a one-off, I wouldn't bother. :) – Sergio Tulentsev May 06 '12 at 15:26
  • @SteveK: that's not a hack. :) – Sergio Tulentsev May 06 '12 at 15:27
  • Bad news, like we suspected, my data is simply too large to process with this M/R. The job has been running for a couple of hours now and the estimated completion (if the rest of the job is linear) is Thu, 24 Apr 2053 08:53:10 :P It looks like I can do 100k batches efficiently, so I think I'll go that route! I suppose I'll need to M/R the data into different collections, then write a script to combine the results, or perhaps I'll M/R each distinct fruit separately. Thanks for the help! – SteveK May 06 '12 at 17:33

I'm sorry to tell you this, but the MongoDB MapReduce framework is incredibly slow and will probably continue to be so for "quite a while" (I wouldn't expect an improvement to be on their roadmap).

Put simply, I wouldn't do this with Mongo's MapReduce; instead, I'd focus on implementing it with the new Aggregation Framework: http://docs.mongodb.org/manual/reference/aggregation/
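
A rough sketch of what that could look like on a modern server (this assumes $objectToArray and $arrayToObject, which require MongoDB 3.4.4+ and did not exist when this was written; the collection name fruit_repo is taken from the question):

// group the dynamic fruits sub-document into per-fruit color counts
db.fruit_repo.aggregate([
    { $match: { fruits: { $exists: true } } },                 // mirror the map's "if (!this.fruits) return"
    { $project: { pairs: { $objectToArray: "$fruits" } } },    // [{k: "apple", v: "red"}, ...]
    { $unwind: "$pairs" },
    { $group: { _id: { fruit: "$pairs.k", color: "$pairs.v" }, count: { $sum: 1 } } },
    { $group: { _id: "$_id.fruit", values: { $push: { k: "$_id.color", v: "$count" } } } },
    { $project: { values: { $arrayToObject: "$values" } } }
], { allowDiskUse: true });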

or running Hadoop on top: http://www.slideshare.net/spf13/mongodb-and-hadoop (nice and simple intro)

I've also had issues with MongoDB being slow when using the built-in MapReduce functionality, and my conclusion is that even for the simplest tasks it doesn't come near the two solutions above in terms of performance. You could easily process more than 1M docs/sec on commodity hardware using the new aggregation framework.

Joe