Currently the logs are stored in DynamoDB. We want to filter out unnecessary rows from that table and store the output in a different table (e.g. exclude rows whose "value" field contains "bot", "python", "requests", etc.).
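For clarity, expressed as a plain SQL condition, the filter I have in mind is roughly the following (illustrative only, written as if the table's attributes were visible as ordinary columns):

-- keep only rows whose "value" does not contain any of the unwanted substrings
value NOT LIKE '%bot%'
  AND value NOT LIKE '%python%'
  AND value NOT LIKE '%requests%'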
So far I have come up with something like this (AWS Data Pipeline template):
{
  "objects": [
    {
      "name": "EmrClusterForBackup",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "masterInstanceType": "m1.medium",
      "amiVersion": "3.3.2",
      "id": "EmrClusterForBackup",
      "type": "EmrCluster",
      "terminateAfter": "2 Hours"
    },
    {
      "occurrences": "1",
      "period": "1 Day",
      "name": "RunOnce",
      "id": "DefaultSchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "name": "DDBExportFormat",
      "id": "DDBExportFormat",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://ti-labs-ml-data/logs/",
      "scheduleType": "cron",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "filterSql": "",
      "name": "TableBackupActivity",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "HiveCopyActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "0.2",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "description": "DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    }
  ],
  "values": {
    "myDDBTableName": "TI-LABS-DDB-A",
    "myDDBReadThroughputRatio": "0.2",
    "myOutputS3Loc": "s3://ti-labs-ml-data/"
  }
}
However, I don't understand what the filter query should look like. I tried one, but it failed with an error saying the row only contains "Item", whereas my table has two fields: id and value.
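In case it helps, what I ultimately want to put into filterSql is a fragment like the one at the top of the question. If DynamoDBExportDataFormat really exposes each record as a single item column (a map), which is what the "Item"-only error seems to suggest, then I would guess the fragment has to go through that map instead, something along these lines (I have not been able to confirm this, which is exactly what I'm asking):

-- guess based on the error message: address attributes through a single "item" map column
item['value'] NOT LIKE '%bot%'
  AND item['value'] NOT LIKE '%python%'
  AND item['value'] NOT LIKE '%requests%'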