An alternative workaround would be to modify your data structure by introducing another field in your schema which holds only the domain value of the email address. This can be done with a bulk update using the Bulk API operations that give a better write response i.e. useful information about what actually happened during the update:
var bulk = db.my_emails.initializeUnorderedBulkOp(),
count = 0;
db.my_emails.find().forEach(function(doc) {
var domain = doc.email.replace(/.*@/, ""),
update = { domain: domain };
bulk.find({ "_id": doc._id }).updateOne({
"$set": update
})
count++;
if (count % 1000 == 0) {
bulk.execute();
bulk = db.my_emails.initializeUnorderedBulkOp();
}
})
if (count % 1000 != 0) { bulk.execute(); }
Bulk update response from sample:
BulkWriteResult({
"writeErrors" : [ ],
"writeConcernErrors" : [ ],
"nInserted" : 0,
"nUpserted" : 0,
"nMatched" : 3,
"nModified" : 3,
"nRemoved" : 0,
"upserted" : [ ]
})
After this update, a query on the collection db.my_emails.find().pretty()
will yield:
{
"_id" : ObjectId("561618af645a64b1a70af2c5"),
"email" : "russel@gmail.com",
"domain" : "gmail.com"
}
{
"_id" : ObjectId("561618af645a64b1a70af2c6"),
"email" : "mickey@yahoo.com",
"domain" : "yahoo.com"
}
{
"_id" : ObjectId("561618af645a64b1a70af2c7"),
"email" : "john@yahoo.com",
"domain" : "yahoo.com"
}
Now, having the domain field will make it easier for the aggregation framework to give you the host count through the $sum
operator in the $group
pipeline. The following pipeline operation will return the desired outcome:
db.my_emails.aggregate([
{
"$group": {
"_id": "$domain",
"count": { "$sum": 1 }
}
}
])
Output:
{ "_id" : "yahoo.com", "count" : 2 }
{ "_id" : "gmail.com", "count" : 1 }