diff --git a/.circleci/config.yml b/.circleci/config.yml index ba6cff7..3f764d9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: docker: - image: microsoft/azure-cli environment: - TERRAFORM_VERSION: 0.12.24 + TERRAFORM_VERSION: 0.12.30 steps: - checkout @@ -19,13 +19,12 @@ jobs: name: Install Terraform command: | wget https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip - unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip -d /usr/local/bin - run: name: Configure test cluster env command: | - echo 'export TF_VAR_aks_tags=\{\"CIRCLE_BUILD_URL\"=\"${CIRCLE_BUILD_URL}\",\"CIRCLE_PR_NUMBER\"=\"${CIRCLE_PR_NUMBER}\",\"CIRCLE_PULL_REQUEST\"=\"${CIRCLE_PULL_REQUEST}\",\"CIRCLE_REPOSITORY_URL\"=\"${CIRCLE_REPOSITORY_URL}\"\}' >> $BASH_ENV + echo 'export TF_VAR_tags=\{\"CIRCLE_BUILD_URL\"=\"${CIRCLE_BUILD_URL}\",\"CIRCLE_PR_NUMBER\"=\"${CIRCLE_PR_NUMBER}\",\"CIRCLE_PULL_REQUEST\"=\"${CIRCLE_PULL_REQUEST}\",\"CIRCLE_REPOSITORY_URL\"=\"${CIRCLE_REPOSITORY_URL}\"\}' >> $BASH_ENV echo 'export WORKSPACE=azure-aks-circleci-${CIRCLE_BUILD_NUM}' >> $BASH_ENV - run: @@ -40,6 +39,10 @@ jobs: name: Terraform init command: terraform init + - run: + name: Terraform validate + command: terraform validate + - run: name: Terraform workspace create command: terraform workspace new ${WORKSPACE} @@ -61,4 +64,3 @@ jobs: name: Terraform workspace delete command: terraform workspace select default && terraform workspace delete ${WORKSPACE} when: always - diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 87a8a2c..c2fe172 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,4 @@ # CODEOWNERS # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-file-location -* @zs-ddl @secretions @steved +* @dominodatalab/platform diff --git a/aks.tf b/aks.tf new file mode 100644 index 0000000..94505fc --- /dev/null +++ b/aks.tf @@ -0,0 +1,92 @@ +resource "azurerm_kubernetes_cluster" "aks" { + lifecycle { + ignore_changes = [ + tags, + default_node_pool[0].node_count, + default_node_pool[0].max_count, + default_node_pool[0].tags, + # VM Size changes cause recreation of the entire cluster + default_node_pool[0].vm_size + ] + } + + name = local.cluster_name + location = local.resource_group.location + resource_group_name = local.resource_group.name + dns_prefix = local.cluster_name + private_cluster_enabled = false + sku_tier = var.cluster_sku_tier + + api_server_authorized_ip_ranges = var.api_server_authorized_ip_ranges + + default_node_pool { + enable_node_public_ip = var.node_pools.platform.enable_node_public_ip + name = "platform" + node_count = var.node_pools.platform.min_count + node_labels = var.node_pools.platform.node_labels + vm_size = var.node_pools.platform.vm_size + availability_zones = var.node_pools.platform.zones + os_disk_size_gb = var.node_pools.platform.os_disk_size_gb + node_taints = var.node_pools.platform.node_taints + enable_auto_scaling = var.node_pools.platform.enable_auto_scaling + min_count = var.node_pools.platform.min_count + max_count = var.node_pools.platform.max_count + max_pods = var.node_pools.platform.max_pods + tags = local.tags + } + + identity { + type = "SystemAssigned" + } + + addon_profile { + kube_dashboard { + enabled = false + } + + oms_agent { + enabled = true + log_analytics_workspace_id = azurerm_log_analytics_workspace.logs.id + } + } + + network_profile { + load_balancer_sku = "Standard" + network_plugin = "azure" + network_policy = "calico" + dns_service_ip = "100.97.0.10" + docker_bridge_cidr = "172.17.0.1/16" + service_cidr = "100.97.0.0/16" + } + + tags = local.tags +} + +resource "azurerm_kubernetes_cluster_node_pool" "aks" { + lifecycle { + ignore_changes = [node_count, max_count, tags] + } + + for_each = { + # Create all node pools except for 'platform' because it is the AKS default + for key, value in var.node_pools : + key => value + if key != "platform" + } + + enable_node_public_ip = each.value.enable_node_public_ip + kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id + name = each.key + node_count = each.value.min_count + vm_size = each.value.vm_size + availability_zones = each.value.zones + os_disk_size_gb = each.value.os_disk_size_gb + os_type = each.value.node_os + node_labels = each.value.node_labels + node_taints = each.value.node_taints + enable_auto_scaling = each.value.enable_auto_scaling + min_count = each.value.min_count + max_count = each.value.max_count + max_pods = each.value.max_pods + tags = local.tags +} diff --git a/main.tf b/main.tf index 312e9f6..7ede680 100644 --- a/main.tf +++ b/main.tf @@ -1,10 +1,10 @@ -provider "azurerm" { - version = ">=2.7.0" - partner_id = "31912fbf-f6dd-5176-bffb-0a01e8ac71f2" - features {} -} - terraform { + required_providers { + azurerm = { + version = "~> 2.46" + } + } + backend "azurerm" { resource_group_name = "dominoterraform" storage_account_name = "dominoterraformstorage" @@ -13,16 +13,20 @@ terraform { } } +provider "azurerm" { + partner_id = "31912fbf-f6dd-5176-bffb-0a01e8ac71f2" + features {} +} + locals { cluster_name = var.cluster_name != null ? var.cluster_name : terraform.workspace resource_group = var.resource_group_name != null ? data.azurerm_resource_group.k8s[0] : azurerm_resource_group.k8s[0] - # Terraform doesn't accept backslash escapes inside the replace function here for some reason, - # nor does it allow for single quotes. This does, somehow, work, but messes up syntax highlighting. - storage_account_name = var.storage_account_name != null ? var.storage_account_name : substr("${replace(local.cluster_name, "/[_-]/", "")}dominostorage", 0, 24) -} -#" this comment is to fix syntax highlighting + safe_storage_cluster_name = replace(local.cluster_name, "/[_-]/", "") + storage_account_name = var.storage_account_name != null ? var.storage_account_name : substr("${local.safe_storage_cluster_name}dominostorage", 0, 24) + tags = merge({ "Cluster" : local.cluster_name }, var.tags) +} data "azurerm_resource_group" "k8s" { count = var.resource_group_name != null ? 1 : 0 @@ -33,31 +37,7 @@ resource "azurerm_resource_group" "k8s" { count = var.resource_group_name == null ? 1 : 0 name = local.cluster_name location = var.location -} - -resource "random_id" "log_analytics_workspace_name_suffix" { - byte_length = 8 -} - -resource "azurerm_log_analytics_workspace" "logs" { - # The WorkSpace name has to be unique across the whole of azure, not just the current subscription/tenant. - name = "${var.log_analytics_workspace_name}-${random_id.log_analytics_workspace_name_suffix.dec}" - location = var.log_analytics_workspace_location - resource_group_name = local.resource_group.name - sku = var.log_analytics_workspace_sku -} - -resource "azurerm_log_analytics_solution" "logs" { - solution_name = "ContainerInsights" - location = azurerm_log_analytics_workspace.logs.location - resource_group_name = local.resource_group.name - workspace_resource_id = azurerm_log_analytics_workspace.logs.id - workspace_name = azurerm_log_analytics_workspace.logs.name - - plan { - publisher = "Microsoft" - product = "OMSGallery/ContainerInsights" - } + tags = local.tags } data "azurerm_subscription" "current" { @@ -69,121 +49,3 @@ resource "azurerm_role_assignment" "sp" { role_definition_name = "Network Contributor" principal_id = azurerm_kubernetes_cluster.aks.identity[0].principal_id } - -resource "azurerm_kubernetes_cluster" "aks" { - lifecycle { - ignore_changes = [ - default_node_pool[0].node_count - ] - } - - name = local.cluster_name - enable_pod_security_policy = false - location = local.resource_group.location - resource_group_name = local.resource_group.name - dns_prefix = local.cluster_name - private_cluster_enabled = false - - api_server_authorized_ip_ranges = var.api_server_authorized_ip_ranges - - default_node_pool { - enable_node_public_ip = var.node_pools.platform.enable_node_public_ip - name = "platform" - node_count = var.node_pools.platform.min_count - node_labels = merge({ "dominodatalab.com/node-pool" : "platform" }, var.node_pools.platform.node_labels) - vm_size = var.node_pools.platform.vm_size - availability_zones = var.node_pools.platform.zones - max_pods = 250 - os_disk_size_gb = 128 - node_taints = var.node_pools.platform.node_taints - enable_auto_scaling = var.node_pools.platform.enable_auto_scaling - min_count = var.node_pools.platform.min_count - max_count = var.node_pools.platform.max_count - tags = {} - } - - identity { - type = "SystemAssigned" - } - - addon_profile { - kube_dashboard { - enabled = false - } - - oms_agent { - enabled = true - log_analytics_workspace_id = azurerm_log_analytics_workspace.logs.id - } - } - - network_profile { - load_balancer_sku = "Standard" - network_plugin = "azure" - network_policy = "calico" - dns_service_ip = "100.97.0.10" - docker_bridge_cidr = "172.17.0.1/16" - service_cidr = "100.97.0.0/16" - } - - tags = merge({ Environment = "Development" }, var.aks_tags) - -} - -resource "azurerm_kubernetes_cluster_node_pool" "aks" { - lifecycle { - ignore_changes = [ - node_count - ] - } - - for_each = { - # Create all node pools except for 'platform' because it is the AKS default - for key, value in var.node_pools : - key => value - if key != "platform" - } - - enable_node_public_ip = each.value.enable_node_public_ip - kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id - name = each.key - node_count = each.value.min_count - vm_size = each.value.vm_size - availability_zones = each.value.zones - max_pods = 250 - os_disk_size_gb = 128 - os_type = each.value.node_os - node_labels = merge({ "dominodatalab.com/node-pool" : each.key }, each.value.node_labels) - node_taints = each.value.node_taints - enable_auto_scaling = each.value.enable_auto_scaling - min_count = each.value.min_count - max_count = each.value.max_count - tags = {} -} - -resource "azurerm_storage_account" "domino" { - name = local.storage_account_name - resource_group_name = local.resource_group.name - location = local.resource_group.location - account_kind = "StorageV2" - account_tier = var.storage_account_tier - account_replication_type = var.storage_account_replication_type - access_tier = "Hot" -} - -resource "azurerm_storage_container" "domino_containers" { - for_each = { - for key, value in var.containers : - key => value - } - - name = substr("${local.cluster_name}-${each.key}", 0, 63) - storage_account_name = azurerm_storage_account.domino.name - container_access_type = each.value.container_access_type - - lifecycle { - ignore_changes = [ - name - ] - } -} diff --git a/monitoring.tf b/monitoring.tf new file mode 100644 index 0000000..f19d898 --- /dev/null +++ b/monitoring.tf @@ -0,0 +1,115 @@ +resource "random_id" "log_analytics_workspace_name_suffix" { + byte_length = 8 +} + +resource "azurerm_log_analytics_workspace" "logs" { + # The WorkSpace name has to be unique across the whole of azure, not just the current subscription/tenant. + name = "${local.resource_group.name}-${random_id.log_analytics_workspace_name_suffix.dec}" + location = local.resource_group.location + resource_group_name = local.resource_group.name + sku = var.log_analytics_workspace_sku + tags = local.tags +} + +resource "azurerm_log_analytics_solution" "logs" { + solution_name = "ContainerInsights" + location = azurerm_log_analytics_workspace.logs.location + resource_group_name = local.resource_group.name + workspace_resource_id = azurerm_log_analytics_workspace.logs.id + workspace_name = azurerm_log_analytics_workspace.logs.name + + plan { + publisher = "Microsoft" + product = "OMSGallery/ContainerInsights" + } + + tags = local.tags + + lifecycle { + ignore_changes = [ + tags + ] + } +} + +resource "azurerm_monitor_diagnostic_setting" "control-plane" { + name = "AKS Control Plane Logging" + target_resource_id = azurerm_kubernetes_cluster.aks.id + log_analytics_workspace_id = azurerm_log_analytics_workspace.logs.id + + log { + category = "kube-apiserver" + + retention_policy { + enabled = true + days = 7 + } + } + + log { + category = "kube-controller-manager" + + retention_policy { + enabled = true + days = 7 + } + } + + log { + category = "kube-scheduler" + + retention_policy { + enabled = true + days = 7 + } + } + + log { + category = "cluster-autoscaler" + + retention_policy { + enabled = true + days = 7 + } + } + + log { + category = "guard" + enabled = false + + retention_policy { + enabled = false + days = 0 + } + } + + log { + category = "kube-audit" + enabled = false + + retention_policy { + enabled = false + days = 0 + } + } + + log { + category = "kube-audit-admin" + enabled = false + + retention_policy { + enabled = false + days = 0 + } + } + + metric { + category = "AllMetrics" + enabled = false + + retention_policy { + enabled = false + days = 0 + } + } +} diff --git a/storage.tf b/storage.tf new file mode 100644 index 0000000..231ba02 --- /dev/null +++ b/storage.tf @@ -0,0 +1,33 @@ +resource "azurerm_storage_account" "domino" { + name = local.storage_account_name + resource_group_name = local.resource_group.name + location = local.resource_group.location + account_kind = "StorageV2" + account_tier = var.storage_account_tier + account_replication_type = var.storage_account_replication_type + access_tier = "Hot" + tags = local.tags + + lifecycle { + ignore_changes = [ + tags + ] + } +} + +resource "azurerm_storage_container" "domino_containers" { + for_each = { + for key, value in var.containers : + key => value + } + + name = substr("${local.cluster_name}-${each.key}", 0, 63) + storage_account_name = azurerm_storage_account.domino.name + container_access_type = each.value.container_access_type + + lifecycle { + ignore_changes = [ + name + ] + } +} diff --git a/variables.tf b/variables.tf index 4f36e8e..4b5973d 100644 --- a/variables.tf +++ b/variables.tf @@ -1,14 +1,3 @@ - -variable "aks_tags" { - type = map(string) - default = {} - description = "AKS Key=Value tags" -} - -variable "agent_count" { - default = 3 -} - variable "api_server_authorized_ip_ranges" { type = list(string) description = "The IP ranges to whitelist for incoming traffic to the masters" @@ -20,6 +9,12 @@ variable "cluster_name" { description = "The Domino cluster name for the K8s cluster and resource group" } +variable "cluster_sku_tier" { + type = string + default = null + description = "The Domino cluster SKU (defaults to Free)" +} + variable "containers" { type = map(object({ container_access_type = string @@ -45,16 +40,7 @@ variable "location" { default = "West US 2" } -variable "log_analytics_workspace_name" { - default = "testLogAnalyticsWorkspaceName" -} - -# refer https://azure.microsoft.com/global-infrastructure/services/?products=monitor for log analytics available regions -variable "log_analytics_workspace_location" { - default = "eastus" -} - -# refer https://azure.microsoft.com/pricing/details/monitor/ for log analytics pricing +# refer https://azure.microsoft.com/pricing/details/monitor/ for log analytics pricing variable "log_analytics_workspace_sku" { default = "PerGB2018" } @@ -70,49 +56,57 @@ variable "node_pools" { enable_auto_scaling = bool min_count = number max_count = number + max_pods = number + os_disk_size_gb = number })) default = { compute = { enable_node_public_ip = false - vm_size = "Standard_DS4_v2" + vm_size = "Standard_D8s_v4" zones = ["1", "2", "3"] node_labels = { - "domino/build-node" = "true" - "dominodatalab.com/build-node" = "true" - "dominodatalab.com/node-pool" = "default" + "dominodatalab.com/node-pool" = "default" } node_os = "Linux" node_taints = [] enable_auto_scaling = true - min_count = 1 - max_count = 4 + min_count = 0 + max_count = 10 + max_pods = 30 + os_disk_size_gb = 128 + } + gpu = { + enable_node_public_ip = false + vm_size = "Standard_NC6s_v3" + zones = [] + node_labels = { + "dominodatalab.com/node-pool" = "default-gpu" + "nvidia.com/gpu" = "true" + } + node_os = "Linux" + node_taints = [ + "nvidia.com/gpu=true:NoExecute" + ] + enable_auto_scaling = true + min_count = 0 + max_count = 1 + max_pods = 30 + os_disk_size_gb = 128 } - # Example GPU Configuration - # gpu = { - # vm_size = "Standard_DS3_v2" - # zones = ["1", "2", "3"] - # node_labels = { - # "dominodatalab.com/node-pool" = "default-gpu" - # "nvidia.com/gpu" = "true" - # } - # node_os = "Linux" - # node_taints = [ - # "nvidia.com/gpu=true" - # ] - # enable_auto_scaling = true - # min_count = 1 - # max_count = 1 - # } platform = { enable_node_public_ip = false - vm_size = "Standard_DS5_v2" + vm_size = "Standard_D8s_v4" zones = ["1", "2", "3"] - node_labels = {} - node_os = "Linux" - node_taints = [] - enable_auto_scaling = true - min_count = 1 - max_count = 4 + node_labels = { + "dominodatalab.com/node-pool" = "platform" + } + node_os = "Linux" + node_taints = [] + enable_auto_scaling = true + min_count = 1 + max_count = 3 + max_pods = 60 + os_disk_size_gb = 128 } } } @@ -138,3 +132,9 @@ variable "subscription_id" { description = "An existing Subscription ID to add the deployment" default = "" } + +variable "tags" { + type = map(string) + default = {} + description = "Tags to apply to resources" +}