Commit fd43442
Merge pull request #13 from dominodatalab/steved/gpu-scale-from-zero
add GPU node pool; update diagnostic configurations
steved authored Feb 18, 2021
2 parents 25c1bdb + 4382fff commit fd43442
Showing 7 changed files with 313 additions and 209 deletions.
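
This PR replaces the hard-coded cluster definition with a generic `var.node_pools` map so that a GPU pool can be added and scaled from zero. The variables file is not part of this excerpt, so the exact schema is inferred from the attribute references in `aks.tf` below; a hypothetical GPU entry might look like this (pool name, VM SKU, and taint are illustrative, not taken from the PR):

```hcl
# Hypothetical tfvars entry; the authoritative schema lives in
# variables.tf, which is not shown in this diff.
node_pools = {
  gpu = {
    vm_size               = "Standard_NC6s_v3" # illustrative GPU SKU
    enable_auto_scaling   = true
    min_count             = 0                  # scale from zero
    max_count             = 4
    max_pods              = 30
    os_disk_size_gb       = 128
    node_os               = "Linux"
    zones                 = ["1"]
    enable_node_public_ip = false
    node_labels           = { "dominodatalab.com/node-pool" = "default-gpu" }
    node_taints           = ["nvidia.com/gpu=present:NoSchedule"]
  }
}
```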
10 changes: 6 additions & 4 deletions .circleci/config.yml
@@ -10,7 +10,7 @@ jobs:
docker:
- image: microsoft/azure-cli
environment:
TERRAFORM_VERSION: 0.12.24
TERRAFORM_VERSION: 0.12.30

steps:
- checkout
@@ -19,13 +19,12 @@ jobs:
name: Install Terraform
command: |
wget https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip
unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip
unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip -d /usr/local/bin
- run:
name: Configure test cluster env
command: |
echo 'export TF_VAR_aks_tags=\{\"CIRCLE_BUILD_URL\"=\"${CIRCLE_BUILD_URL}\",\"CIRCLE_PR_NUMBER\"=\"${CIRCLE_PR_NUMBER}\",\"CIRCLE_PULL_REQUEST\"=\"${CIRCLE_PULL_REQUEST}\",\"CIRCLE_REPOSITORY_URL\"=\"${CIRCLE_REPOSITORY_URL}\"\}' >> $BASH_ENV
echo 'export TF_VAR_tags=\{\"CIRCLE_BUILD_URL\"=\"${CIRCLE_BUILD_URL}\",\"CIRCLE_PR_NUMBER\"=\"${CIRCLE_PR_NUMBER}\",\"CIRCLE_PULL_REQUEST\"=\"${CIRCLE_PULL_REQUEST}\",\"CIRCLE_REPOSITORY_URL\"=\"${CIRCLE_REPOSITORY_URL}\"\}' >> $BASH_ENV
echo 'export WORKSPACE=azure-aks-circleci-${CIRCLE_BUILD_NUM}' >> $BASH_ENV
- run:
@@ -40,6 +39,10 @@ jobs:
name: Terraform init
command: terraform init

- run:
name: Terraform validate
command: terraform validate

- run:
name: Terraform workspace create
command: terraform workspace new ${WORKSPACE}
@@ -61,4 +64,3 @@ jobs:
name: Terraform workspace delete
command: terraform workspace select default && terraform workspace delete ${WORKSPACE}
when: always
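
The heavily escaped `echo` above simply writes a Terraform map into `$BASH_ENV`; the rename from `TF_VAR_aks_tags` to `TF_VAR_tags` tracks the variable rename in `main.tf`. After shell expansion, the export is equivalent to setting the following in a tfvars file (the quoted values are placeholders for whatever CircleCI injects at build time):

```hcl
# Equivalent tfvars form of the TF_VAR_tags export; values shown are
# placeholders for the CircleCI-provided environment variables.
tags = {
  "CIRCLE_BUILD_URL"      = "https://circleci.com/gh/dominodatalab/example/123"
  "CIRCLE_PR_NUMBER"      = "13"
  "CIRCLE_PULL_REQUEST"   = "https://github.com/dominodatalab/example/pull/13"
  "CIRCLE_REPOSITORY_URL" = "https://github.com/dominodatalab/example"
}
```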

2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1,4 +1,4 @@
# CODEOWNERS
# https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-file-location

* @zs-ddl @secretions @steved
* @dominodatalab/platform
92 changes: 92 additions & 0 deletions aks.tf
@@ -0,0 +1,92 @@
resource "azurerm_kubernetes_cluster" "aks" {
lifecycle {
ignore_changes = [
tags,
default_node_pool[0].node_count,
default_node_pool[0].max_count,
default_node_pool[0].tags,
# VM Size changes cause recreation of the entire cluster
default_node_pool[0].vm_size
]
}

name = local.cluster_name
location = local.resource_group.location
resource_group_name = local.resource_group.name
dns_prefix = local.cluster_name
private_cluster_enabled = false
sku_tier = var.cluster_sku_tier

api_server_authorized_ip_ranges = var.api_server_authorized_ip_ranges

default_node_pool {
enable_node_public_ip = var.node_pools.platform.enable_node_public_ip
name = "platform"
node_count = var.node_pools.platform.min_count
node_labels = var.node_pools.platform.node_labels
vm_size = var.node_pools.platform.vm_size
availability_zones = var.node_pools.platform.zones
os_disk_size_gb = var.node_pools.platform.os_disk_size_gb
node_taints = var.node_pools.platform.node_taints
enable_auto_scaling = var.node_pools.platform.enable_auto_scaling
min_count = var.node_pools.platform.min_count
max_count = var.node_pools.platform.max_count
max_pods = var.node_pools.platform.max_pods
tags = local.tags
}

identity {
type = "SystemAssigned"
}

addon_profile {
kube_dashboard {
enabled = false
}

oms_agent {
enabled = true
log_analytics_workspace_id = azurerm_log_analytics_workspace.logs.id
}
}

network_profile {
load_balancer_sku = "Standard"
network_plugin = "azure"
network_policy = "calico"
dns_service_ip = "100.97.0.10"
docker_bridge_cidr = "172.17.0.1/16"
service_cidr = "100.97.0.0/16"
}

tags = local.tags
}

resource "azurerm_kubernetes_cluster_node_pool" "aks" {
lifecycle {
ignore_changes = [node_count, max_count, tags]
}

for_each = {
# Create all node pools except for 'platform' because it is the AKS default
for key, value in var.node_pools :
key => value
if key != "platform"
}

enable_node_public_ip = each.value.enable_node_public_ip
kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id
name = each.key
node_count = each.value.min_count
vm_size = each.value.vm_size
availability_zones = each.value.zones
os_disk_size_gb = each.value.os_disk_size_gb
os_type = each.value.node_os
node_labels = each.value.node_labels
node_taints = each.value.node_taints
enable_auto_scaling = each.value.enable_auto_scaling
min_count = each.value.min_count
max_count = each.value.max_count
max_pods = each.value.max_pods
tags = local.tags
}
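
`variables.tf` is not among the rendered files, but the attribute references in both resources above pin down the object shape every node-pool entry must carry. A sketch of the implied declaration (types inferred from usage; the real file may add defaults or descriptions):

```hcl
# Inferred from the attribute references in aks.tf; the actual
# declaration in variables.tf may differ.
variable "node_pools" {
  type = map(object({
    enable_node_public_ip = bool
    vm_size               = string
    zones                 = list(string)
    os_disk_size_gb       = number
    node_os               = string # referenced only by the non-default pools
    node_labels           = map(string)
    node_taints           = list(string)
    enable_auto_scaling   = bool
    min_count             = number
    max_count             = number
    max_pods              = number
  }))
}
```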
170 changes: 16 additions & 154 deletions main.tf
@@ -1,10 +1,10 @@
provider "azurerm" {
version = ">=2.7.0"
partner_id = "31912fbf-f6dd-5176-bffb-0a01e8ac71f2"
features {}
}

terraform {
required_providers {
azurerm = {
version = "~> 2.46"
}
}

backend "azurerm" {
resource_group_name = "dominoterraform"
storage_account_name = "dominoterraformstorage"
@@ -13,16 +13,20 @@ terraform {
}
}

provider "azurerm" {
partner_id = "31912fbf-f6dd-5176-bffb-0a01e8ac71f2"
features {}
}

locals {
cluster_name = var.cluster_name != null ? var.cluster_name : terraform.workspace
resource_group = var.resource_group_name != null ? data.azurerm_resource_group.k8s[0] : azurerm_resource_group.k8s[0]

# Terraform doesn't accept backslash escapes inside the replace function here for some reason,
# nor does it allow for single quotes. This does, somehow, work, but messes up syntax highlighting.
storage_account_name = var.storage_account_name != null ? var.storage_account_name : substr("${replace(local.cluster_name, "/[_-]/", "")}dominostorage", 0, 24)
}
#" this comment is to fix syntax highlighting
safe_storage_cluster_name = replace(local.cluster_name, "/[_-]/", "")
storage_account_name = var.storage_account_name != null ? var.storage_account_name : substr("${local.safe_storage_cluster_name}dominostorage", 0, 24)
tags = merge({ "Cluster" : local.cluster_name }, var.tags)
}
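
Splitting the regex sanitization into `safe_storage_cluster_name` removes the need for the old `#"` syntax-highlighting workaround and keeps each local readable. Traced with a hypothetical cluster name, the logic also shows how the 24-character Azure storage-account name limit is respected:

```hcl
# Worked example, assuming cluster_name = "my_test-cluster":
#   safe_storage_cluster_name = replace("my_test-cluster", "/[_-]/", "")
#                             = "mytestcluster"
#   storage_account_name      = substr("mytestclusterdominostorage", 0, 24)
#                             = "mytestclusterdominostora"
```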
data "azurerm_resource_group" "k8s" {
count = var.resource_group_name != null ? 1 : 0
@@ -33,31 +37,7 @@ resource "azurerm_resource_group" "k8s" {
count = var.resource_group_name == null ? 1 : 0
name = local.cluster_name
location = var.location
}

resource "random_id" "log_analytics_workspace_name_suffix" {
byte_length = 8
}

resource "azurerm_log_analytics_workspace" "logs" {
# The workspace name has to be unique across the whole of Azure, not just the current subscription/tenant.
name = "${var.log_analytics_workspace_name}-${random_id.log_analytics_workspace_name_suffix.dec}"
location = var.log_analytics_workspace_location
resource_group_name = local.resource_group.name
sku = var.log_analytics_workspace_sku
}

resource "azurerm_log_analytics_solution" "logs" {
solution_name = "ContainerInsights"
location = azurerm_log_analytics_workspace.logs.location
resource_group_name = local.resource_group.name
workspace_resource_id = azurerm_log_analytics_workspace.logs.id
workspace_name = azurerm_log_analytics_workspace.logs.name

plan {
publisher = "Microsoft"
product = "OMSGallery/ContainerInsights"
}
tags = local.tags
}

data "azurerm_subscription" "current" {
@@ -69,121 +49,3 @@ resource "azurerm_role_assignment" "sp" {
role_definition_name = "Network Contributor"
principal_id = azurerm_kubernetes_cluster.aks.identity[0].principal_id
}

resource "azurerm_kubernetes_cluster" "aks" {
lifecycle {
ignore_changes = [
default_node_pool[0].node_count
]
}

name = local.cluster_name
enable_pod_security_policy = false
location = local.resource_group.location
resource_group_name = local.resource_group.name
dns_prefix = local.cluster_name
private_cluster_enabled = false

api_server_authorized_ip_ranges = var.api_server_authorized_ip_ranges

default_node_pool {
enable_node_public_ip = var.node_pools.platform.enable_node_public_ip
name = "platform"
node_count = var.node_pools.platform.min_count
node_labels = merge({ "dominodatalab.com/node-pool" : "platform" }, var.node_pools.platform.node_labels)
vm_size = var.node_pools.platform.vm_size
availability_zones = var.node_pools.platform.zones
max_pods = 250
os_disk_size_gb = 128
node_taints = var.node_pools.platform.node_taints
enable_auto_scaling = var.node_pools.platform.enable_auto_scaling
min_count = var.node_pools.platform.min_count
max_count = var.node_pools.platform.max_count
tags = {}
}

identity {
type = "SystemAssigned"
}

addon_profile {
kube_dashboard {
enabled = false
}

oms_agent {
enabled = true
log_analytics_workspace_id = azurerm_log_analytics_workspace.logs.id
}
}

network_profile {
load_balancer_sku = "Standard"
network_plugin = "azure"
network_policy = "calico"
dns_service_ip = "100.97.0.10"
docker_bridge_cidr = "172.17.0.1/16"
service_cidr = "100.97.0.0/16"
}

tags = merge({ Environment = "Development" }, var.aks_tags)

}

resource "azurerm_kubernetes_cluster_node_pool" "aks" {
lifecycle {
ignore_changes = [
node_count
]
}

for_each = {
# Create all node pools except for 'platform' because it is the AKS default
for key, value in var.node_pools :
key => value
if key != "platform"
}

enable_node_public_ip = each.value.enable_node_public_ip
kubernetes_cluster_id = azurerm_kubernetes_cluster.aks.id
name = each.key
node_count = each.value.min_count
vm_size = each.value.vm_size
availability_zones = each.value.zones
max_pods = 250
os_disk_size_gb = 128
os_type = each.value.node_os
node_labels = merge({ "dominodatalab.com/node-pool" : each.key }, each.value.node_labels)
node_taints = each.value.node_taints
enable_auto_scaling = each.value.enable_auto_scaling
min_count = each.value.min_count
max_count = each.value.max_count
tags = {}
}

resource "azurerm_storage_account" "domino" {
name = local.storage_account_name
resource_group_name = local.resource_group.name
location = local.resource_group.location
account_kind = "StorageV2"
account_tier = var.storage_account_tier
account_replication_type = var.storage_account_replication_type
access_tier = "Hot"
}

resource "azurerm_storage_container" "domino_containers" {
for_each = {
for key, value in var.containers :
key => value
}

name = substr("${local.cluster_name}-${each.key}", 0, 63)
storage_account_name = azurerm_storage_account.domino.name
container_access_type = each.value.container_access_type

lifecycle {
ignore_changes = [
name
]
}
}
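
The Log Analytics workspace, ContainerInsights solution, storage account, and storage containers deleted from `main.tf` above do not disappear: the summary counts seven changed files and only four are rendered here, so they are presumably relocated, and the `oms_agent` block in the new `aks.tf` still references `azurerm_log_analytics_workspace.logs.id`. A minimal sketch of the workspace that reference implies, assuming the relocated definition mirrors the deleted one plus the new shared tags:

```hcl
# Sketch only -- the relocated definition lives in a file not rendered
# in this excerpt and may differ, per the "update diagnostic
# configurations" note in the commit message.
resource "azurerm_log_analytics_workspace" "logs" {
  # Workspace names must be unique across all of Azure, hence the random suffix.
  name                = "${var.log_analytics_workspace_name}-${random_id.log_analytics_workspace_name_suffix.dec}"
  location            = var.log_analytics_workspace_location
  resource_group_name = local.resource_group.name
  sku                 = var.log_analytics_workspace_sku
  tags                = local.tags
}
```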