
I'm trying to run a Lambda that inserts metadata into my DB every time an object is put into my S3 bucket.

Because I am using MongoDB, I have code to persist DB connections between calls. The problem I am having is that I can't get the metadata.

All the code for inserting into the DB is done; I just need help getting the metadata from an AWS Lambda.

Here is my code (mostly copied from the MongoDB site):

"use strict";
const MongoClient = require('mongodb').MongoClient;
const MONGODB_URI = 'mongodb://cam_writer:1%40kGM%26LL%26gA5y7NVk1cvl9@cluster0-shard-00-00-hlygq.mongodb.net:27017,cluster0-shard-00-01-hlygq.mongodb.net:27017,cluster0-shard-00-02-hlygq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin&retryWrites=true'; // or Atlas connection string

var AWS = require('aws-sdk')
var s3 = new AWS.S3() // AWS.S3 is a constructor, so "new" is required
let cachedDb = null;

function connectToDatabase (uri) {

  console.log('=> connect to database');

  if (cachedDb) {
    console.log('=> using cached database instance');
    return Promise.resolve(cachedDb);
  }



  return MongoClient.connect(uri)
    .then(client => {
      cachedDb = client.db('events');
      return cachedDb;
    });

}


function queryDatabase (db) {
  console.log('=> query database');

  return db.collection('detection_events').find({}).toArray()
    .then(() => { return { statusCode: 200, body: 'success' }; })
    .catch(err => {
      console.log('=> an error occurred: ', err);
      return { statusCode: 500, body: 'error' };
    });
}

function insertIntoDb (db, obj) {
  console.log('=> inserting data into db');

  return db.collection('detection_events').insertOne(obj)
}

module.exports.handler = (event, context, callback) => {

  context.callbackWaitsForEmptyEventLoop = false;

  console.log(event)

  var meta = {test : "test", "key": event}; // HOW DO I GET THE ACTUAL METADATA FOR THE EVENT?

  console.log('event: ', event);

  connectToDatabase(MONGODB_URI)
    .then(db => insertIntoDb(db, meta))
    .then(result => {
      console.log('=> returning result: ', result);
      callback(null, result);
    })
    .catch(err => {
      console.log('=> an error occurred: ', err);
      callback(err);
    });
};

I know that the "event" passed into a Lambda by S3 does not contain the metadata. In Python I was able to get the metadata by using boto3; I just don't know how to do it in Node.js (let alone Node.js in an AWS Lambda).

EDIT:

So I've updated my code as per the first answer below. The code is now:

"use strict";
const MongoClient = require('mongodb').MongoClient;
const MONGODB_URI = 'mongodb://cam_writer:1%40kGM%26LL%26gA5y7NVk1cvl9@cluster0-shard-00-00-hlygq.mongodb.net:27017,cluster0-shard-00-01-hlygq.mongodb.net:27017,cluster0-shard-00-02-hlygq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin&retryWrites=true'; // or Atlas connection string

const AWS = require('aws-sdk')
const s3 = new AWS.S3()
let cachedDb = null;

const connectToDatabase = uri => {

    console.log('=> connect to database');

    if (cachedDb) {
        console.log('=> using cached database instance');
        return Promise.resolve(cachedDb);
    }

    return MongoClient.connect(uri)
        .then(client => {
            cachedDb = client.db('events');
            return Promise.resolve(cachedDb);
        });

}

function insertIntoDb(db, obj) {
    console.log('=> inserting data into db');

    return db.collection('detection_events').insertOne(obj)
}

module.exports.handler = async (event) => {

    const db = await connectToDatabase(MONGODB_URI);

    // Finally, get the HEAD for the S3 object.
    // Decode the key first: S3 delivers it URL-encoded, with "+" for spaces
    // (see EDIT 2 in the answer below).
    const key = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "));
    const head = await s3.headObject({
        Bucket: event.Records[0].s3.bucket.name,
        Key: key
    }).promise();

    const meta = head.Metadata;
    meta.bucket = event.Records[0].s3.bucket.name;
    meta.key = key;
    console.log(meta)

    const result = await insertIntoDb(db, meta)

    console.log(result)
    return {
        statusCode: 201,
        body: JSON.stringify(result)
    }
};

I ran my code, which inserts a bunch of images into S3. This generated about 25 connections in MongoDB. How can I keep the connection count low with the Lambda? I thought the code copied from the MongoDB website would let me do this.


1 Answer


Since you're using an S3 event, you can get the S3 bucket and key from event.Records[0].s3.bucket.name and event.Records[0].s3.object.key, then fetch the object's metadata with headObject:

const params = {
  Bucket: event.Records[0].s3.bucket.name, 
  Key: event.Records[0].s3.object.key
 };
 s3.headObject(params, function(err, data) {
  if (err) {
    console.log(err, err.stack);
    return;
  }
  console.log(data); // data.Metadata holds the user-defined object metadata
});

Just make sure you put this inside your DB callback; otherwise, you'll lose track of it.
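
For instance, a rough sketch of wiring it into the question's original promise chain (reusing the question's connectToDatabase, insertIntoDb, and callback; exact error handling is my assumption) could look like this:

connectToDatabase(MONGODB_URI)
  .then(db => {
    const params = {
      Bucket: event.Records[0].s3.bucket.name,
      Key: event.Records[0].s3.object.key
    };
    // fetch the object's HEAD inside the DB callback, then insert its metadata
    s3.headObject(params, (err, head) => {
      if (err) return callback(err);
      insertIntoDb(db, head.Metadata)
        .then(result => callback(null, result))
        .catch(callback);
    });
  })
  .catch(callback);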

I'd highly recommend using async/await, though, so you won't have to deal with the famous callback hell. Here's the refactored code:

"use strict";
const MongoClient = require('mongodb').MongoClient;
const MONGODB_URI = 'mongodb://cam_writer:1%40kGM%26LL%26gA5y7NVk1cvl9@cluster0-shard-00-00-hlygq.mongodb.net:27017,cluster0-shard-00-01-hlygq.mongodb.net:27017,cluster0-shard-00-02-hlygq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin&retryWrites=true'; // or Atlas connection string

const AWS = require('aws-sdk')
const s3 = new AWS.S3() // "new" is required: AWS.S3 is a constructor
let cachedDb = null;

const connectToDatabase = uri => {

  console.log('=> connect to database');

  if (cachedDb) {
    console.log('=> using cached database instance');
    return Promise.resolve(cachedDb);
  }

  return MongoClient.connect(uri)
    .then(client => {
      cachedDb = client.db('events');
      return Promise.resolve(cachedDb);
    });

}

function insertIntoDb (db, obj) {
  console.log('=> inserting data into db');

  return db.collection('detection_events').insertOne(obj)
}

module.exports.handler = async (event) => {

  const db = await connectToDatabase(MONGODB_URI);

  const result = await insertIntoDb(db, {
    bucket: event.Records[0].s3.bucket.name,
    key: event.Records[0].s3.object.key
  })

  console.log(result)

  //finally get the HEAD for the s3Object
  const head = await s3.headObject({
    Bucket: event.Records[0].s3.bucket.name,
    Key: event.Records[0].s3.object.key
  }).promise();

  console.log(head)

  return {
    statusCode: 201,
    body: JSON.stringify(result)
  }

};

This should be enough to get you off the ground.

EDIT: I recommend taking a look at the official AWS SDK for Node.js, as its documentation is quite good.

EDIT 2: as per Michael's suggestion, if your file names may contain whitespace, use decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " ")) instead of event.Records[0].s3.object.key.
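
For illustration, the full key extraction then looks like this (srcKey is just an example variable name):

// "+" becomes a space, then percent-escapes (e.g. %C3%B6) are decoded
const srcKey = decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "));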

EDIT 3: Now that your code works: you said it inserts a "bunch" of images into S3. S3 fires one event per inserted image, which means N Lambda containers can spin up concurrently, each creating a new MongoDB connection.

One workaround here is to set the limit of concurrent executions on your Lambda function to a lower number, so you can control how many connections can be open at the same time.

To do it, go to your Lambda's console and set Concurrency to whatever number you want (in the example below, I use 5). This should be enough to get you through what you need.

[Screenshot: the Lambda console's Concurrency panel, with reserved concurrency set to 5]
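
If you prefer to set the limit programmatically, here's a minimal sketch using the same aws-sdk as above (the function name is hypothetical; putFunctionConcurrency reserves concurrency for the function):

const AWS = require('aws-sdk');
const lambda = new AWS.Lambda();

// cap concurrent containers, and therefore concurrent MongoDB connections
lambda.putFunctionConcurrency({
  FunctionName: 'my-s3-metadata-writer',  // hypothetical function name
  ReservedConcurrentExecutions: 5
}).promise()
  .then(() => console.log('concurrency limit set'))
  .catch(console.error);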

  • In S3 events, the object key is presented in S3's semi-standard encoding format. The `+` character must be replaced with ASCII 0x20 (space) and the resulting value must be URL-decoded. The SDK will double-encode and the HEAD will fail if this isn't done **if** the object key has spaces or any unicode characters that S3 applies these internal rules to. The expression to correctly extract the key: `decodeURIComponent(event.Records[0].s3.object.key.replace(/\+/g, " "))` https://docs.aws.amazon.com/lambda/latest/dg/with-s3-example-deployment-pkg.html#with-s3-example-deployment-pkg-nodejs – Michael - sqlbot Mar 02 '19 at 19:15
  • You're correct as always, Michael. I left it out for simplicity's sake, but I shouldn't be so lazy :). Funny enough, I ran a serverless workshop last week where I was not dealing with whitespace, so of course somebody uploaded a file with whitespace in the name and ran into issues. I managed to fix it with the regex above, but it was a lesson learned. Anyway, I think applications should handle whitespace upon upload, thereby avoiding these types of errors; after all, whitespace in filenames is considered bad practice. – Thales Minussi Mar 02 '19 at 19:20
  • I have edited my answer accordingly, @Michael-sqlbot – Thales Minussi Mar 02 '19 at 19:22
  • Agreed, spaces are bad news in general... but utf-8 multibyte characters and possibly others can also get caught up in this encoding. Shrödinger becomes Shr%C3%B6dinger, etc., so the unescaping is always necessary to avoid a Heisenbug. (You see what I did there?) – Michael - sqlbot Mar 02 '19 at 20:32
  • Schrodinger’s cat and Heisenbug? :) – Thales Minussi Mar 02 '19 at 21:00
  • Hi, I implemented this code and it seems to be working, but the connections are still increasing to my mongodb. Seems like the lambda is not reusing the connections as it should be, any idea why this is happening? Doesn't the cachedDB get remembered by the lambda somehow? Have edited my question to include the full code – A_toaster Mar 03 '19 at 22:06
  • @ThalesMinussi sorry forgot to tag you in the above – A_toaster Mar 03 '19 at 22:14
  • @A_toaster it depends. If your uploads trigger concurrent executions, a new Lambda container will spin up, creating one more connection to your MongoDB. A connection is only reused when the same container is reused. – Thales Minussi Mar 03 '19 at 22:23
  • Please check this answer, where I explain roughly how caching / containers work, so you can get a better understanding of what may be going on: https://stackoverflow.com/questions/54947095/aws-lambda-mysql-caching/54947780#54947780 – Thales Minussi Mar 03 '19 at 22:25
  • @A_toaster I have edited my answer with a few more bits of information, since you said that your code inserts many images into S3. Hope this helps! – Thales Minussi Mar 03 '19 at 22:32
  • @ThalesMinussi Thanks for the details! I probably should have mentioned the use case in my question: I am using IoT cameras to insert images into S3. This Lambda is then designed to make an entry in my MongoDB for every image. Since there's a pretty high rate of images being put into S3, won't limiting the concurrency cause a bottleneck on the metadata being inserted into MongoDB? – A_toaster Mar 03 '19 at 22:49
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/189361/discussion-between-thales-minussi-and-a-toaster). – Thales Minussi Mar 03 '19 at 22:50