I am trying to use Terraform to deploy Azure Databricks workspace and cluster. The workspace got created successfully along with the user group and i am able to login to Databricks successfully.
The problem i am having is while creating the cluster. Terraform apply fails after about 20 mins or so.
"public_network_access_enabled" is set to "True" in my terraform code for my workspace
If i look at the console of Azure DB, i see under Compute that the state is pending and it stays in pending state for around 20 mins and then everything disappears and terraform apply throws this error message below
Just for information we are operating in a landing zone and i have created two subnets for Databricks, one private and one public with /26 The vnet is peered to the hub vnet as it is in any landing zone.
[![1114-001756-9y2ijklp is not able to transition from TERMINATED to RUNNING: Instance was not reachable. This can be a transient networking issue. If the problem persists, this usually indicates a network environment misconfiguration. Please check your cloud provider config..., Termination info : code: INSTANCE_UNREACHABLE, type: , parameters: map\[databricks_error_message:
Instance was not reachable.
VM extension code: ProvisioningState/succeded
instanceId: InstanceId(939d10dc729547c5bedb7cf32be7ecd1)
workerEnv: workerenv-3791028417964786][1]][1]
Additional details (may be truncated):
The rest of the error is attached in the screenshot.
Just to inform, i tried logging in to Databricks interactively from the Azure portal and tried creating the cluster manually and it succeeds. It succeeds means i can see the cluster at least which i wasn't able to see while trying to deploy using Terraform. But the issue is when i try to start the cluster, it tries to start but after around 10 mins or so, it shows terminated
If i check the Event Log inside Databricks it says :
Cluster terminated.Reason: Instances unreachable
My Terraform Code:
# Premium-SKU Azure Databricks workspace deployed into the project's own
# virtual network: the public and private subnets (and their NSG associations)
# come from the "centralus" instance of the virtualnetwork module.
resource "azurerm_databricks_workspace" "db-workspace" {
  name                = module.names-db-workspace.environment.databricks_workspace.name_unique
  location            = module.resourcegroup.resource_group.location
  resource_group_name = module.resourcegroup.resource_group.name
  sku                 = "premium"
  tags                = local.tags

  # Public network access to the workspace is left enabled.
  public_network_access_enabled = true

  custom_parameters {
    # no_public_ip = false: public IP addresses are allowed
    # (see the azurerm provider documentation for this flag).
    no_public_ip       = false
    virtual_network_id = module.virtualnetwork["centralus"].virtual_network.self.id

    # Subnet names must match the keys declared in local.vnet.subnets.
    private_subnet_name = module.virtualnetwork["centralus"].virtual_network.subnets["db-sub-2-private"].name
    public_subnet_name  = module.virtualnetwork["centralus"].virtual_network.subnets["db-sub-1-public"].name

    # NSG association keys follow the "<nsg name>-<subnet name>" pattern.
    private_subnet_network_security_group_association_id = module.virtualnetwork["centralus"].virtual_network.nsgs.associations.subnets["databricks-private-nsg-db-sub-2-private"].id
    public_subnet_network_security_group_association_id  = module.virtualnetwork["centralus"].virtual_network.nsgs.associations.subnets["databricks-public-nsg-db-sub-1-public"].id
  }
}
# Autoscaling Databricks cluster (1 to 7 workers) named after the project and
# environment, using Azure spot instances with the first node on demand.
resource "databricks_cluster" "dbcselfservice" {
  cluster_name  = format("adb-cluster-%s-%s", var.project.name, var.project.environment.name)
  node_type_id  = var.node_type_id
  spark_version = var.spark_version

  # Minutes of inactivity after which the cluster is terminated automatically.
  autotermination_minutes = 20

  autoscale {
    max_workers = 7
    min_workers = 1
  }

  azure_attributes {
    first_on_demand    = 1
    availability       = "SPOT_AZURE"
    spot_bid_max_price = 100
  }

  # The cluster can only be created once the workspace itself exists.
  depends_on = [azurerm_databricks_workspace.db-workspace]
}
# Declarative description of the project's virtual network, consumed by the
# "virtualnetwork" module: subnets, application security groups and network
# security groups.
locals {
  vnet = {
    # Set to false to skip creating the virtual network altogether.
    enable = true

    # Each subnet takes one quarter of the first "centralus-default" CIDR
    # (cidrsubnet with 2 new bits, network numbers 0 through 3).
    subnets = {
      general = {
        cidrs   = [cidrsubnet(var.project.cidrs["centralus-default"][0], 2, 0)]
        private = { endpoint = true, service = false }
        service = {
          endpoints = [
            "Microsoft.KeyVault",
            "Microsoft.Storage",
            "Microsoft.Web",
            "Microsoft.EventHub",
            "Microsoft.Sql",
            "Microsoft.AzureCosmosDB"
          ]
          delegations = {}
        }
      }

      webapp = {
        cidrs   = [cidrsubnet(var.project.cidrs["centralus-default"][0], 2, 1)]
        private = { endpoint = false, service = false }
        service = {
          endpoints = []
          delegations = {
            "Microsoft.Web/serverFarms" = {
              actions = ["Microsoft.Network/virtualNetworks/subnets/action"]
              name    = "Microsoft.Web/serverFarms"
            }
          }
        }
      }

      # waf = {
      #   cidrs   = [cidrsubnet(var.project.cidrs["centralus-default"][0], 4, 8)]
      #   private = { endpoint = false, service = false }
      #   service = { endpoints = [], delegations = {}
      #   }
      # }

      # Databricks "public" (host) subnet. Both Databricks subnets are
      # delegated to Microsoft.Databricks/workspaces with the full set of
      # actions listed in the azurerm provider examples for this delegation,
      # including unprepareNetworkPolicies.
      db-sub-1-public = {
        cidrs   = [cidrsubnet(var.project.cidrs["centralus-default"][0], 2, 2)]
        private = { endpoint = false, service = false }
        service = {
          endpoints = []
          delegations = {
            "Microsoft.Databricks/workspaces" = {
              actions = [
                "Microsoft.Network/virtualNetworks/subnets/join/action",
                "Microsoft.Network/virtualNetworks/subnets/prepareNetworkPolicies/action",
                "Microsoft.Network/virtualNetworks/subnets/unprepareNetworkPolicies/action"
              ]
              name = "Microsoft.Databricks/workspaces"
            }
          }
        }
      }

      # Databricks "private" (container) subnet; same delegation as above.
      db-sub-2-private = {
        cidrs   = [cidrsubnet(var.project.cidrs["centralus-default"][0], 2, 3)]
        private = { endpoint = false, service = false }
        service = {
          endpoints = []
          delegations = {
            "Microsoft.Databricks/workspaces" = {
              actions = [
                "Microsoft.Network/virtualNetworks/subnets/join/action",
                "Microsoft.Network/virtualNetworks/subnets/prepareNetworkPolicies/action",
                "Microsoft.Network/virtualNetworks/subnets/unprepareNetworkPolicies/action"
              ]
              name = "Microsoft.Databricks/workspaces"
            }
          }
        }
      }
    }

    asgs = {}

    # One NSG per Databricks subnet. No custom rules are active: every rule
    # below is a commented-out template.
    # NOTE(review): with a delegated subnet Azure Databricks is documented to
    # provision its required NSG rules itself - confirm before enabling any of
    # these. In the templates the two "within-cluster" rules carry distinct
    # keys (-inbound / -outbound) because map keys must be unique, and every
    # rule is closed with its own brace so the text can be uncommented as is.
    nsgs = {
      databricks-public-nsg = {
        subnets = ["db-sub-1-public"]
        rules = {
          # "databricks-worker-to-webapp" = {
          #   priority  = 101
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["443"]
          #     asgs   = []
          #     prefix = "AzureDatabricks"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-storage" = {
          #   priority  = 104
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["443"]
          #     asgs   = []
          #     prefix = "Storage"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-sql" = {
          #   priority  = 106
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["3306"]
          #     asgs   = []
          #     prefix = "Sql"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-within-cluster-inbound" = {
          #   priority  = 100
          #   access    = "Allow"
          #   direction = "Inbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-within-cluster-outbound" = {
          #   priority  = 108
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-event-hubs" = {
          #   priority  = 110
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["9093"]
          #     asgs   = []
          #     prefix = "Eventhubs"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
        }
      }

      databricks-private-nsg = {
        subnets = ["db-sub-2-private"]
        rules = {
          # "databricks-worker-to-webapp" = {
          #   priority  = 101
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["443"]
          #     asgs   = []
          #     prefix = "AzureDatabricks"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-storage" = {
          #   priority  = 104
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["443"]
          #     asgs   = []
          #     prefix = "Storage"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-sql" = {
          #   priority  = 106
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["3306"]
          #     asgs   = []
          #     prefix = "Sql"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-within-cluster-inbound" = {
          #   priority  = 100
          #   access    = "Allow"
          #   direction = "Inbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-within-cluster-outbound" = {
          #   priority  = 108
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
          # "databricks-worker-to-event-hubs" = {
          #   priority  = 110
          #   access    = "Allow"
          #   direction = "Outbound"
          #   protocol  = "tcp"
          #   destination = {
          #     ports  = ["9093"]
          #     asgs   = []
          #     prefix = "Eventhubs"
          #   }
          #   source = {
          #     ports  = ["*"]
          #     asgs   = []
          #     prefix = "VirtualNetwork"
          #   }
          # }
        }
      }
    }
  }
}
# Project virtual network, built from the local.vnet description by the
# organisation's virtual-network module (pinned to the 7.x release line).
module "virtualnetwork" {
  source  = "contoso.com/virtual-network/azurerm"
  version = "~> 7.0"

  # A single "centralus" instance when the vnet is enabled, none otherwise.
  for_each = local.vnet.enable ? { centralus = "Central US" } : {}

  providers = {
    azurerm     = azurerm
    azurerm.hub = azurerm.hub
    random      = random
  }

  # Names of hub-side resources, taken from the project's hub settings.
  hub_resource_group_name       = var.project.hub.resourcegroup.name
  hub_virtual_hub_name          = var.project.hub.virtualhub.name
  hub_ddos_protection_plan_name = var.project.hub.ddosprotectionplan.name

  # Placement, naming and tagging of the virtual network.
  resource_group_name         = module.resourcegroup.resource_group.name
  virtual_network_name        = var.project.name
  virtual_network_environment = var.project.environment.name
  virtual_network_location    = each.value
  virtual_network_tags        = module.resourcegroup.resource_group.tags

  # Address space is looked up as "<instance key>-default" in var.project.cidrs.
  virtual_network_cidrs = var.project.cidrs[format("%s-default", each.key)]

  # Subnets, ASGs and NSGs as declared in local.vnet.
  virtual_network_subnets         = local.vnet.subnets
  application_security_groups     = local.vnet.asgs
  virtual_network_security_groups = local.vnet.nsgs
}