We have a big database. We collect newsletters and I want to make a trending page. The goal is to make the page realtime and fast! We want to display trending newsletters from the past 2 hours, 4 hours, 24 hours, past week, and past month.
I've worked with MongoDB
for a while and I try to keep things simple. I want a new collection, trending, that stores the visitors of the newsletter pages in a time bucket. On every visit, I want to add the information of the newsletter to the object that holds the trending newsletters for that time and $inc
the hits
field for statistics.
My objects are:
{
"_id" : ObjectId("5d4b4ca5a6bba5f7ffb23b39"),
"bucket" : "last2hours",
"language" : "nl",
"time" : "2019-08-08_00",
"newsletters" : {
"5d4b29ba8ddf870fe15628c7" : {
"_id" : ObjectId("5d4b29ba8ddf870fe15628c7"),
"_slug" : "nieuwsbrief-dalstra-reizen-touring-december-2015",
"subject" : "Nieuwsbrief Dalstra Reizen Touring december 2015",
"date" : ISODate("2015-12-04T13:15:03.000+0000"),
"publisher" : {
"_id" : ObjectId("557ebcc54c79597761fd71c2"),
"_slug" : "dalstra-nl",
"name" : "dalstra.nl",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29af8ddf870fe15624ba" : {
"_id" : ObjectId("5d4b29af8ddf870fe15624ba"),
"_slug" : "the-carolina-weddings-show",
"subject" : "The Carolina Weddings Show",
"date" : ISODate("2015-12-04T13:13:54.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29ad8ddf870fe15623f4" : {
"_id" : ObjectId("5d4b29ad8ddf870fe15623f4"),
"_slug" : "newport-gangster-tour",
"subject" : "Newport Gangster Tour",
"date" : ISODate("2015-12-04T13:13:22.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29bb8ddf870fe15628f3" : {
"_id" : ObjectId("5d4b29bb8ddf870fe15628f3"),
"_slug" : "springwise-daily-shoe-insoles-control-devices-through-kicking-and-more",
"subject" : "Springwise Daily | Shoe insoles control devices through kicking, and more.",
"date" : ISODate("2015-12-04T13:15:05.000+0000"),
"publisher" : {
"_id" : ObjectId("5581f0b54c7959e82bfd71c2"),
"_slug" : "springwise-com",
"name" : "springwise.com",
"taal" : "nl"
},
"hits" : NumberInt(2)
}
}
}
{
"_id" : ObjectId("5d4b4ca5a6bba5f7ffb23b3b"),
"bucket" : "last2hours",
"language" : "nl",
"time" : "2019-08-08_01",
"newsletters" : {
"5d4b29ba8ddf870fe15628c7" : {
"_id" : ObjectId("5d4b29ba8ddf870fe15628c7"),
"_slug" : "nieuwsbrief-dalstra-reizen-touring-december-2015",
"subject" : "Nieuwsbrief Dalstra Reizen Touring december 2015",
"date" : ISODate("2015-12-04T13:15:03.000+0000"),
"publisher" : {
"_id" : ObjectId("557ebcc54c79597761fd71c2"),
"_slug" : "dalstra-nl",
"name" : "dalstra.nl",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29af8ddf870fe15624ba" : {
"_id" : ObjectId("5d4b29af8ddf870fe15624ba"),
"_slug" : "the-carolina-weddings-show",
"subject" : "The Carolina Weddings Show",
"date" : ISODate("2015-12-04T13:13:54.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29ad8ddf870fe15623f4" : {
"_id" : ObjectId("5d4b29ad8ddf870fe15623f4"),
"_slug" : "newport-gangster-tour",
"subject" : "Newport Gangster Tour",
"date" : ISODate("2015-12-04T13:13:22.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29bb8ddf870fe15628f3" : {
"_id" : ObjectId("5d4b29bb8ddf870fe15628f3"),
"_slug" : "springwise-daily-shoe-insoles-control-devices-through-kicking-and-more",
"subject" : "Springwise Daily | Shoe insoles control devices through kicking, and more.",
"date" : ISODate("2015-12-04T13:15:05.000+0000"),
"publisher" : {
"_id" : ObjectId("5581f0b54c7959e82bfd71c2"),
"_slug" : "springwise-com",
"name" : "springwise.com",
"taal" : "nl"
},
"hits" : NumberInt(2)
}
}
}
{
"_id" : ObjectId("5d4b4ca5a6bba5f7ffb23b3d"),
"bucket" : "last4hours",
"language" : "nl",
"time" : "2019-08-08_00",
"newsletters" : {
"5d4b29ba8ddf870fe15628c7" : {
"_id" : ObjectId("5d4b29ba8ddf870fe15628c7"),
"_slug" : "nieuwsbrief-dalstra-reizen-touring-december-2015",
"subject" : "Nieuwsbrief Dalstra Reizen Touring december 2015",
"date" : ISODate("2015-12-04T13:15:03.000+0000"),
"publisher" : {
"_id" : ObjectId("557ebcc54c79597761fd71c2"),
"_slug" : "dalstra-nl",
"name" : "dalstra.nl",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29af8ddf870fe15624ba" : {
"_id" : ObjectId("5d4b29af8ddf870fe15624ba"),
"_slug" : "the-carolina-weddings-show",
"subject" : "The Carolina Weddings Show",
"date" : ISODate("2015-12-04T13:13:54.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29ad8ddf870fe15623f4" : {
"_id" : ObjectId("5d4b29ad8ddf870fe15623f4"),
"_slug" : "newport-gangster-tour",
"subject" : "Newport Gangster Tour",
"date" : ISODate("2015-12-04T13:13:22.000+0000"),
"publisher" : {
"_id" : ObjectId("503b950fffa67e2c790007d7"),
"_slug" : "livingsocialcom",
"name" : "Livingsocial.com",
"taal" : "nl"
},
"hits" : NumberInt(1)
},
"5d4b29bb8ddf870fe15628f3" : {
"_id" : ObjectId("5d4b29bb8ddf870fe15628f3"),
"_slug" : "springwise-daily-shoe-insoles-control-devices-through-kicking-and-more",
"subject" : "Springwise Daily | Shoe insoles control devices through kicking, and more.",
"date" : ISODate("2015-12-04T13:15:05.000+0000"),
"publisher" : {
"_id" : ObjectId("5581f0b54c7959e82bfd71c2"),
"_slug" : "springwise-com",
"name" : "springwise.com",
"taal" : "nl"
},
"hits" : NumberInt(2)
}
}
}
The goal here is to only have to query the bucket language time. So, if I want to see the trending newsletters of the last 2 hours, I query {bucket: 'last2hours', language: 'nl', time: '2019-08-08_00'}
, then I have all the information I need. No need for aggregation. This findOne
query is fast.
So i made a method to update the trending collection:
public function setNewsletterTrendingStatistics($newsletter){
// Buckets
$trend_buckets = array(
'last2hours' => array('steps' => 2, 'step'=>'hour', 'format'=> 'Y-m-d_H'),
'last4hours' => array('steps' => 4, 'step'=>'hour', 'format'=> 'Y-m-d_H' ),
'last1day' => array('steps' => 24, 'step'=>'hour', 'format'=> 'Y-m-d_H' ),
'lastweek' => array('steps' => 7, 'step'=>'day', 'format'=> 'Y-m-d' ),
'lastmonth' => array('steps' => 31, 'step'=>'day', 'format'=> 'Y-m-d' ),
);
// $newsletter['date']->toDateTime()->format('U')
$buckets = array();
foreach($trend_buckets AS $bucket => $settings){
for($i=0; $i<$settings['steps']; $i++){
$buckets[] = array(
'bucket' => $bucket,
'time' => date($settings['format'], strtotime('+'. $i . ' ' . $settings['step'])),
'language' => $newsletter['publisher']['taal'],
);
}
}
// Add the stats to each bucket
foreach($buckets AS $bucket){
$query = array();
$query = $bucket;
$update = array(
'$set' => array(
'newsletters.' . (string) $newsletter['_id'] . '._id' => $newsletter['_id'],
'newsletters.' . (string) $newsletter['_id'] . '._slug' => $newsletter['_slug'],
'newsletters.' . (string) $newsletter['_id'] . '.subject' => $newsletter['subject'],
'newsletters.' . (string) $newsletter['_id'] . '.date' => $newsletter['date'],
'newsletters.' . (string) $newsletter['_id'] . '.publisher' => array(
'_id' => $newsletter['publisher']['_id'],
'_slug' => $newsletter['publisher']['_slug'],
'name' => $newsletter['publisher']['name'],
'taal' => $newsletter['publisher']['taal'],
),
),
'$inc' => array(
'newsletters.' . (string) $newsletter['_id'] . '.hits' => 1
),
);
$options = array('upsert'=>true);
$this->FW->mdb->{$this->config['collections']['newsletters_trending']}->updateOne($query, $update, $options);
}
}
First of all, is this a good approach? Is there a better approach? Second, I want to count unique hits, so I need to save an IP address
. I want to count unique hits on the update
query so I don't have to count on the findOne
query. Whats the best way to achieve this? I know I can use addtoset
for a unique array with IP addresses
. But then I need to count these unique IP addresses
.