I am processing a stream of text data where I don't know ahead of time what the distribution of its values is, but I know each record looks like this:
{
"datetime": "1986-11-03T08:30:00-07:00",
"word": "wordA",
"value": "someValue"
}
I'm trying to bucket each record into RethinkDB objects based on its value, where the objects look like the following:
{
"bucketId": "1",
"bucketValues": {
"wordA": [
{"datetime": "1986-11-03T08:30:00-07:00"},
{"datetime": "1986-11-03T08:30:00-07:00"}
],
"wordB": [
{"datetime": "1986-11-03T08:30:00-07:00"},
{"datetime": "1986-11-03T08:30:00-07:00"}
]
}
}
The purpose is to eventually count the number of occurrences for each word in each bucket.
Since I'm dealing with about a million buckets, and have no knowledge of the words ahead of time, the plan is to create these objects on the fly. I am new to RethinkDB, however, and I have tried my best to do this in such a way that I don't attempt to add a word key to a bucket that doesn't exist yet, but I am not entirely sure if I'm following best practice here by chaining the commands as follows (note that I am running this on a Node.js server):
const bucketId = "someId";
const word = "someWordValue";

// Step 1: make sure the bucket document exists before any word is added to it.
// The lookup and the conditional insert are sent to the server as one query.
r.do(r.table("buckets").get(bucketId), function(existingBucket) {
  return r.branch(
    // If the bucket doesn't exist
    existingBucket.eq(null),
    // Create it with an empty word map
    r.table("buckets").insert({
      "id": bucketId,
      "bucketValues": {}
    }),
    // Else do nothing
    "Bucket already exists"
  );
})
  .run()
  .then(function(result) {
    console.log(result);
    // Step 2: runs as a separate query once step 1 has resolved, so the bucket
    // is expected to exist here. Returning the promise (instead of nesting a
    // second .then inside this callback) keeps the chain flat and lets a
    // failure in this query reach the .catch at the end.
    return r.table("buckets").get(bucketId)
      .do(function(bucket) {
        return r.branch(
          // If the word already exists
          bucket("bucketValues").keys().contains(word),
          // Just append to it (code not implemented yet)
          "Word already exists",
          // Else create the word key with an empty list of entries
          r.table("buckets").get(bucketId).update(
            {"bucketValues": r.object(word, [/*Put the timestamp here*/])}
          )
        );
      })
      .run();
  })
  .then(function(result) {
    console.log(result);
  })
  .catch(function(err) {
    // Without this handler a failure in either query would be an unhandled
    // promise rejection.
    console.error(err);
  });
Do I need to execute run here twice, or am I way off base on how you're supposed to properly chain things together with RethinkDB? I just want to make sure I'm not doing this the wrong/hard way before I get much deeper into this.