Currently the logs are stored in DynamoDB. We want to filter out unnecessary rows from that table and store the output in a different table (e.g. exclude rows whose "value" field contains "bot", "python", "requests", etc.).
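For clarity, expressed as a plain SQL condition, the filter I have in mind is roughly the following (illustrative only, written as if the table's attributes were visible as ordinary columns):

-- keep only rows whose "value" does not contain any of the unwanted substrings
value NOT LIKE '%bot%'
  AND value NOT LIKE '%python%'
  AND value NOT LIKE '%requests%'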
So far I have come up with something like this (AWS Data Pipeline template):
{
  "objects": [
    {
      "name": "EmrClusterForBackup",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "masterInstanceType": "m1.medium",
      "amiVersion": "3.3.2",
      "id": "EmrClusterForBackup",
      "type": "EmrCluster",
      "terminateAfter": "2 Hours"
    },
    {
      "occurrences": "1",
      "period": "1 Day",
      "name": "RunOnce",
      "id": "DefaultSchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "name": "DDBExportFormat",
      "id": "DDBExportFormat",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://ti-labs-ml-data/logs/",
      "scheduleType": "cron",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "filterSql": "",
      "name": "TableBackupActivity",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "HiveCopyActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "0.2",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "description": "DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    }
  ],
  "values": {
    "myDDBTableName": "TI-LABS-DDB-A",
    "myDDBReadThroughputRatio": "0.2",
    "myOutputS3Loc": "s3://ti-labs-ml-data/"
  }
}
However, I don't understand what the filter query should look like. I tried one, but it failed with an error saying the row only contains "Item", whereas my table has two fields: id and value.
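In case it helps, what I ultimately want to put into filterSql is a fragment like the one at the top of the question. If DynamoDBExportDataFormat really exposes each record as a single item column (a map), which is what the "Item"-only error seems to suggest, then I would guess the fragment has to go through that map instead, something along these lines (I have not been able to confirm this, which is exactly what I'm asking):

-- guess based on the error message: address attributes through a single "item" map column
item['value'] NOT LIKE '%bot%'
  AND item['value'] NOT LIKE '%python%'
  AND item['value'] NOT LIKE '%requests%'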