I'm using google.cloud dataprocv1 and trying to create a dataproc cluster. I have to clone a github repo into the cluster, so using Initialization actions, I'm trying to run a bash script which is in Storage bucket of GCP. I'm seeing this error when the code is executed.
`Error:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/app/dataproc_creation.py", line 35, in create_cluster
operation = cluster_client.create_cluster(
File "/usr/local/lib/python3.10/site-packages/google/cloud/dataproc_v1/services/cluster_controller/client.py", line 605, in create_cluster
request = clusters.CreateClusterRequest(request)
File "/usr/local/lib/python3.10/site-packages/proto/message.py", line 598, in __init__
pb_value = marshal.to_proto(pb_type, value)
File "/usr/local/lib/python3.10/site-packages/proto/marshal/marshal.py", line 217, in to_proto
pb_value = rule.to_proto(value)
File "/usr/local/lib/python3.10/site-packages/proto/marshal/rules/message.py", line 36, in to_proto
return self._descriptor(**value)
ValueError: Protocol message NodeInitializationAction has no "executableFile" field.
Error: exit status 1
`
Function for cluster creation:
def create_cluster(project_id, region, cluster_name):
# Create a client with the endpoint set to the desired cluster region.
cluster_client = dataproc.ClusterControllerClient(
client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)
# Create the cluster config.
cluster = {
"project_id": project_id,
"cluster_name": cluster_name,
"config": {
"master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-2"},
"worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-2"},
"initializationActions": [
{
"executableFile": "gs://extracted-bucket/init_actions/git_clone.sh"
}
]
},
}
# Create the cluster.
operation = cluster_client.create_cluster(
request={"project_id": project_id, "region": region, "cluster": cluster}
)
result = operation.result()
# Output a success message.
print(f"Cluster created successfully: {result.cluster_name}")
Without the initialization actions, the cluster is getting created. But with the init actions, the error occurred. Tried passing the executableFile as below
init_action = {
'executableFile': 'gs://<BUCKET_NAME>/init_actions/git_clone.py',
'executionTimeout': '3600s'
}
and passing the init_actions in the config
cluster_config = {
"gce_cluster_config": {...},
"master_config": {...},
"worker_config": {...},
"initialization_actions": [init_action]
}