I've written some CDK code to programmatically create an AWS Data Pipeline that backs up a DynamoDB table to an S3 bucket on a daily basis, but it keeps failing with this error:
amazonaws.datapipeline.taskrunner.TaskExecutionException: Failed to complete EMR transform.
    at amazonaws.datapipeline.activity.EmrActivity.runActivity(EmrActivity.java:67)
    at amazonaws.datapipeline.objects.AbstractActivity.run(AbstractActivity.java:16)
    at amazonaws.datapipeline.taskrunner.TaskPoller.executeRemoteRunner(TaskPoller.java:136)
    at amazonaws.datapipeline.taskrunner.TaskPoller.executeTask(TaskPoller.java:105)
    at amazonaws.datapipeline.taskrunner.TaskPoller$1.run(TaskPoller.java:81)
    at private.com.amazonaws.services.datapipeline.poller.PollWorker.executeWork(PollWorker.java:76)
    at private.com.amazonaws.services.datapipeline.poller.PollWorker.run(PollWorker.java:53)
    at java.lang.Thread.run(Thread.java:750)
Caused by:
....
fatal error: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied
    at amazonaws.datapipeline.activity.mapreduce.HadoopJobRunner.executeCommand(HadoopJobRunner.java:175)
    at amazonaws.datapipeline.activity.mapreduce.HadoopJobRunner.ex
I'm using the DataPipelineDefaultResourceRole and DataPipelineDefaultRole for this pipeline, both of which have the s3:* permission, so I'm puzzled why this is happening.
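Since the preStepCommand's aws s3 rm runs on the EMR cluster itself, I assume the resource role (the cluster's EC2 instance profile) is the one that needs the bucket access. For reference, here is a minimal sketch of how I could grant that role scoped access to the backup bucket instead of relying on s3:* (the construct IDs are arbitrary, and it assumes a monocdk version that has Role.fromRoleName):

import { Role } from "monocdk/aws-iam";
import { Bucket } from "monocdk/aws-s3";

// Look up the existing role and bucket by name (run inside the stack class;
// "ResourceRole" and "BackupBucket" are arbitrary construct IDs).
const resourceRole = Role.fromRoleName(this, "ResourceRole", "DataPipelineDefaultResourceRole");
const backupBucket = Bucket.fromBucketName(this, "BackupBucket", "ddb-table-backup");

// grantReadWrite covers bucket listing plus object read/write/delete,
// which is what "aws s3 rm --recursive" and the export step need.
backupBucket.grantReadWrite(resourceRole);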
On top of that, I'm not sure why logging is not enabled on the EMR cluster that this pipeline spins up, even though I've specified a log location via the myLogUri parameter.
Any pointers, please? Here's the CDK code:
import { CfnPipeline } from "monocdk/aws-datapipeline";

private createDataPipeline(props: InfrastructureStackProps): CfnPipeline {
  const dataPipelineName = "a-nice-datapipeline8.23";
  const pipeline = new CfnPipeline(this, dataPipelineName, {
    name: dataPipelineName,
    parameterObjects: [
      {
        id: "myDDBTableName",
        attributes: [
          { key: "Description", stringValue: "Source table" },
          { key: "Type", stringValue: "String" },
          { key: "Default", stringValue: "Attributes" },
        ],
      },
      {
        id: "myOutputS3Location",
        attributes: [
          { key: "Description", stringValue: "Output S3 Location" },
          { key: "Type", stringValue: "String" },
          { key: "Default", stringValue: "s3://ddb-table-backup/" },
        ],
      },
      {
        id: "myDdbReadThroughputRatio",
        attributes: [
          { key: "Description", stringValue: "DynamoDB Read Throughput Ratio" },
          { key: "Type", stringValue: "Double" },
          { key: "Default", stringValue: "0.15" },
        ],
      },
      {
        id: "myLogUri",
        attributes: [
          { key: "type", stringValue: "AWS::S3::ObjectKey" },
          { key: "description", stringValue: "DataPipeline Log Uri" },
        ],
      },
      {
        id: "myDDBRegion",
        attributes: [
          { key: "Description", stringValue: "Region of the DynamoDB Table" },
          { key: "Type", stringValue: "String" },
          { key: "Default", stringValue: props.region },
        ],
      },
    ],
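    // Concrete values bound to the parameter definitions above.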
    parameterValues: [
      { id: "myDDBTableName", stringValue: "Attributes" },
      { id: "myOutputS3Location", stringValue: "s3://ddb-table-backup/" },
      { id: "myDdbReadThroughputRatio", stringValue: "0.15" },
      { id: "myLogUri", stringValue: "s3://data_pipeline_log/" },
      { id: "myDDBRegion", stringValue: props.region },
    ],
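    // The pipeline definition itself: the EMR cluster, the two data nodes,
    // the schedule, and the backup activity.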
    pipelineObjects: [
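      // EMR cluster that runs the export. 'resourceRole' becomes the EC2
      // instance profile on the cluster nodes; 'role' is the service role.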
      {
        id: "EmrClusterForBackup",
        name: "EmrClusterForBackup",
        fields: [
          { key: "resourceRole", stringValue: "DataPipelineDefaultResourceRole" },
          { key: "role", stringValue: "DataPipelineDefaultRole" },
          { key: "coreInstanceCount", stringValue: "1" },
          { key: "coreInstanceType", stringValue: "m4.xlarge" },
          { key: "releaseLabel", stringValue: "emr-5.29.0" },
          { key: "masterInstanceType", stringValue: "m4.xlarge" },
          { key: "region", stringValue: props.region },
          { key: "type", stringValue: "EmrCluster" },
          { key: "terminateAfter", stringValue: "2 Hours" },
        ],
      },
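      // Output data node: the S3 prefix the export is written to.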
      {
        id: "S3BackupLocation",
        name: "S3BackupLocation",
        fields: [
          { key: "directoryPath", stringValue: "s3://ddb-table-backup/" },
          { key: "type", stringValue: "S3DataNode" },
        ],
      },
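      // Input data node: the DynamoDB table being backed up.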
      {
        id: "DDBSourceTable",
        name: "DDBSourceTable",
        fields: [
          { key: "readThroughputPercent", stringValue: "0.15" },
          { key: "type", stringValue: "DynamoDBDataNode" },
          { key: "tableName", stringValue: "Attributes" },
        ],
      },
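      // Pipeline-wide defaults, including the schedule reference and the
      // pipelineLogUri that logging is supposed to use.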
      {
        id: "Default",
        name: "Default",
        fields: [
          { key: "failureAndRerunMode", stringValue: "CASCADE" },
          { key: "resourceRole", stringValue: "DataPipelineDefaultResourceRole" },
          { key: "role", stringValue: "DataPipelineDefaultRole" },
          { key: "scheduleType", stringValue: "cron" },
          { key: "schedule", refValue: "DailySchedule" },
          { key: "pipelineLogUri", stringValue: "s3://data_pipeline_log/" },
          { key: "type", stringValue: "Default" },
        ],
      },
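      // Daily schedule referenced by the Default object above.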
      {
        id: "DailySchedule",
        name: "Every 1 day",
        fields: [
          { key: "type", stringValue: "Schedule" },
          { key: "period", stringValue: "1 Day" },
          { key: "startDateTime", stringValue: "2021-12-20T00:00:00" },
        ],
      },
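      // The EmrActivity itself. As far as I can tell, the preStepCommand's
      // 'aws s3 rm' is what issues the ListObjectsV2 call from the stack trace.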
      {
        id: "TableBackupActivity",
        name: "TableBackupActivity",
        fields: [
          { key: "type", stringValue: "EmrActivity" },
          { key: "output", refValue: "S3BackupLocation" },
          { key: "input", refValue: "DDBSourceTable" },
          { key: "maximumRetries", stringValue: "2" },
          {
            key: "preStepCommand",
            stringValue: "(sudo yum -y update aws-cli) && (aws s3 rm #{output.directoryPath} --recursive)",
          },
          {
            key: "step",
            stringValue: "s3://dynamodb-dpl-#{myDDBRegion}/emr-ddb-storage-handler/4.11.0/emr-dynamodb-tools-4.11.0-SNAPSHOT-jar-with-dependencies.jar,org.apache.hadoop.dynamodb.tools.DynamoDBExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}",
          },
          { key: "runsOn", refValue: "EmrClusterForBackup" },
          { key: "resizeClusterBeforeRunning", stringValue: "false" },
        ],
      },
    ],
    activate: true,
  });

  return pipeline;
}
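For completeness, the method is wired into the stack roughly like this (the shape of InfrastructureStackProps is my assumption, since createDataPipeline only reads props.region; the region value is illustrative):

import { App, Construct, Stack, StackProps } from "monocdk";

// Assumed props shape; only `region` is read by createDataPipeline.
interface InfrastructureStackProps extends StackProps {
  region: string;
}

class InfrastructureStack extends Stack {
  constructor(scope: Construct, id: string, props: InfrastructureStackProps) {
    super(scope, id, props);
    this.createDataPipeline(props);
  }

  private createDataPipeline(props: InfrastructureStackProps) {
    // ...as shown above...
  }
}

const app = new App();
new InfrastructureStack(app, "Infrastructure", { region: "us-east-1" });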