Commit

Merge pull request #44 from oracle-quickstart/2.11.0
2.11.0 Release
arnaudfroidmont authored Oct 16, 2024
2 parents 382c496 + e5ee24e commit 6c9652e
Showing 158 changed files with 45,035 additions and 2,601 deletions.
25 changes: 15 additions & 10 deletions README.md
@@ -23,6 +23,7 @@ Allow dynamic-group instance_principal to manage compute-management-family in compartment compartmentName
Allow dynamic-group instance_principal to manage instance-family in compartment compartmentName
Allow dynamic-group instance_principal to use virtual-network-family in compartment compartmentName
Allow dynamic-group instance_principal to use volumes in compartment compartmentName
Allow dynamic-group instance_principal to manage dns in compartment compartmentName
```
or:

@@ -34,12 +35,9 @@ The stack allows various combinations of OS. Here is a list of what has been tested:

| Controller | Compute |
|---------------|--------------|
| OL7 | OL7 |
| OL7 | OL8 |
| OL7 | CentOS7 |
| OL8 | OL8 |
| OL8 | OL7 |
| Ubuntu 20.04 | Ubuntu 20.04 |
| OL8 | OL8 |
| OL8 | OL7 |
| Ubuntu 22.04 | Ubuntu 22.04 |

When switching to Ubuntu, make sure the username is changed from opc to ubuntu in the ORM for both the controller and compute nodes.
## How is resizing different from autoscaling?
@@ -276,10 +274,6 @@ Example:
```
/opt/oci-hpc/bin/create_cluster.sh 4 compute2-1-hpc HPC_instance compute2
```
The name of the cluster must follow the pattern:
queueName-clusterNumber-instanceType_keyword

The keyword must match an instance type keyword from /opt/oci-hpc/conf/queues.conf for the cluster to be registered in Slurm, as illustrated below.
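
For illustration only, here is a hypothetical breakdown of the example name used above (the queue and keyword values are just those from the example; your own queues.conf defines the valid ones):
```
# Hypothetical breakdown of the example cluster name "compute2-1-hpc":
#   compute2 -> queue name (a queue defined in /opt/oci-hpc/conf/queues.conf)
#   1        -> cluster number
#   hpc      -> instance type keyword (must match the keyword of an instance type in queues.conf)
```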

### Cluster Deletion:
```
@@ -422,3 +416,14 @@ By default, this check box is enabled. By selecting this check box, a PAR would
Step 2: Use the shell script upload_rdma_nic_metrics.sh to collect metrics and upload them to object storage.
Run upload_rdma_nic_metrics.sh to collect the metrics and upload them to object storage. The metrics collection limit and interval can be configured through the config file rdma_metrics_collection_config.conf; a sketch of the workflow follows.
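
A minimal sketch of that workflow is shown below; the exact locations of the script and config file are assumptions, so adjust the paths to wherever they are installed on your controller:
```
# Sketch only: paths are assumptions, not confirmed by this README.
# 1. Set the metrics collection limit and interval in the config file.
vi rdma_metrics_collection_config.conf
# 2. Run the collection/upload script (it reads the config file above).
./upload_rdma_nic_metrics.sh
```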

## Meshpinger

Meshpinger is a tool for validating network-layer connectivity between RDMA NICs on a cluster network in OCI. The tool initiates an ICMP ping from every RDMA NIC port on the cluster network to every other RDMA NIC port on the same cluster network and reports the success/failure status of the pings in the form of logs.

Running the tool before starting a workload on a cluster network serves as a good precheck to gain confidence in the network reachability between RDMA NICs (a minimal illustrative sketch follows the list below). Typical causes of reachability failures that the tool can help pinpoint are:
1. Link down on the RDMA NIC
2. RDMA interface initialization or configuration issues, including IP address assignment to the interface
3. Insufficient ARP table size on the node to store all needed peer MAC addresses
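
For illustration only, here is a minimal sketch of the mesh-ping idea. This is not the Meshpinger implementation; the interface names and peer IPs are placeholder assumptions:
```
# Illustrative sketch only -- not the actual Meshpinger tool.
# Ping every peer RDMA IP from every local RDMA interface and report the result.
RDMA_IFACES="rdma0 rdma1"                # assumption: local RDMA interface names
PEER_IPS="10.224.0.11 10.224.0.12"       # assumption: peer RDMA NIC IPs on the cluster network

for iface in $RDMA_IFACES; do
  for ip in $PEER_IPS; do
    # -I binds the probe to a source interface, -c 1 sends one ping, -W 2 sets a 2-second timeout
    if ping -I "$iface" -c 1 -W 2 "$ip" > /dev/null 2>&1; then
      echo "$iface -> $ip: OK"
    else
      echo "$iface -> $ip: FAIL"
    fi
  done
done
```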
93 changes: 51 additions & 42 deletions autoscaling/crontab/autoscale_slurm.sh
@@ -111,12 +111,12 @@ def getDefaultsConfig(config,queue_name):
for instance_type in partition["instance_types"]:
if "default" in instance_type.keys():
if instance_type["default"]:
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
if len(partition["instance_types"])>0:
instance_type=partition["instance_types"][0]
print ("No default configuration was found, there may be a problem in your queues.conf file")
print ("Selecting "+instance_type["name"]+" as default")
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
print ("The queue "+queue_name+" was not found in the queues.conf file")
return None

@@ -125,7 +125,7 @@ def getJobConfig(config,queue_name,instance_type_name):
if queue_name == partition["name"]:
for instance_type in partition["instance_types"]:
if instance_type_name == instance_type["name"]:
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
return None

def getQueueLimits(config,queue_name,instance_type_name):
@@ -136,11 +136,11 @@ def getQueueLimits(config,queue_name,instance_type_name):
return {"max_number_nodes": int(instance_type["max_number_nodes"]), "max_cluster_size": int(instance_type["max_cluster_size"]),"max_cluster_count": int(instance_type["max_cluster_count"])}
return {"max_number_nodes": 0, "max_cluster_size": 0,"max_cluster_count": 0}

def getInstanceType(config,queue_name,instance_keyword):
def getInstanceType(config,queue_name,hostname_convention):
for partition in config:
if queue_name == partition["name"]:
for instance_type in partition["instance_types"]:
if instance_keyword == instance_type["instance_keyword"]:
if hostname_convention == instance_type["hostname_convention"]:
return instance_type["name"]
return None

@@ -161,26 +161,33 @@ def getAllClusterNames(config):
return availableNames

def getClusterName(node):
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
stdout,stderr = out.communicate()
clusterName = None
try:
if len(stdout.split('\n')) > 2:
for output in stdout.split('\n')[:-1]:
if "Switches=" in output:
clusterName=output.split()[0].split('SwitchName=')[1]
break
elif "SwitchName=inactive-" in output:
continue
else:
clusterName=output.split()[0].split('SwitchName=')[1]
elif len(stdout.split('\n')) == 2:
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
if clusterName.startswith("inactive-"):
details=getNodeDetails(node)
clusterName="NOCLUSTERFOUND"
for feature in details[0].split(","):
if feature.startswith('CN__'):
clusterName=feature[4:]
if clusterName == "NOCLUSTERFOUND":
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
stdout,stderr = out.communicate()
clusterName = None
try:
if len(stdout.split('\n')) > 2:
for output in stdout.split('\n')[:-1]:
if "Switches=" in output:
clusterName=output.split()[0].split('SwitchName=')[1]
break
elif "SwitchName=inactive-" in output:
continue
else:
clusterName=output.split()[0].split('SwitchName=')[1]
elif len(stdout.split('\n')) == 2:
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
if clusterName.startswith("inactive-"):
return "NOCLUSTERFOUND"
except:
print('No ClusterName could be found for '+node)
print('There seems to be some issues in the slurm topology file')
return "NOCLUSTERFOUND"
except:
print('No ClusterName could be found for '+node)
return "NOCLUSTERFOUND"
return clusterName

def getstatus_slurm():
@@ -246,7 +253,7 @@ def getstatus_slurm():
clustername=getClusterName(node)
if clustername is None:
continue
instanceType=features[-1]
instanceType=features[0]
if queue in current_nodes.keys():
if instanceType in current_nodes[queue].keys():
current_nodes[queue][instanceType]+=1
@@ -276,7 +283,9 @@ def getstatus_slurm():
cluster_to_destroy=[]
for clustername in nodes_to_destroy_temp.keys():
destroyEntireCluster=True
if clustername in running_cluster or clustername == "NOCLUSTERFOUND":
if clustername == "NOCLUSTERFOUND":
destroyEntireCluster=False
elif clustername in running_cluster:
nodes_to_destroy[clustername]=nodes_to_destroy_temp[clustername]
destroyEntireCluster=False
else:
@@ -295,10 +304,10 @@ def getstatus_slurm():
for clusterName in os.listdir(clusters_path):
if len(clusterName.split('-')) < 3:
continue
instance_keyword='-'.join(clusterName.split('-')[2:])
hostname_convention='-'.join(clusterName.split('-')[2:])
clusterNumber=int(clusterName.split('-')[1])
queue=clusterName.split('-')[0]
instanceType=getInstanceType(config,queue,instance_keyword)
instanceType=getInstanceType(config,queue,hostname_convention)
if not queue in used_index.keys():
used_index[queue]={}
if not instanceType in used_index[queue].keys():
@@ -311,19 +320,19 @@ def getstatus_slurm():
nodes = line.split()[0]
instance_type = line.split()[1]
queue = line.split()[2]
try:
cluster_building.append([int(nodes),instance_type,queue])
if queue in building_nodes.keys():
if instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]+=int(nodes)
try:
cluster_building.append([int(nodes),instance_type,queue])
if queue in building_nodes.keys():
if instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]+=int(nodes)
else:
building_nodes[queue][instance_type]=int(nodes)
else:
building_nodes[queue][instance_type]=int(nodes)
else:
building_nodes[queue]={instance_type:int(nodes)}
except ValueError:
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
print ('Ignoring')
continue
building_nodes[queue]={instance_type:int(nodes)}
except ValueError:
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
print ('Ignoring')
continue
if os.path.isfile(os.path.join(clusters_path,clusterName,'currently_destroying')):
cluster_destroying.append(clusterName)
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
@@ -422,7 +431,7 @@ if autoscaling == "true":
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["hostname_convention"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
@@ -448,5 +457,5 @@ if autoscaling == "true":
traceback.print_exc()
os.remove(lockfile)
else:
print("Autoscaling is false")
print("Autoscaling is false (set in /etc/ansible/hosts)")
exit()
2 changes: 2 additions & 0 deletions autoscaling/tf_init/cluster-network.tf
@@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" {
}
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configuration {
availability_domain = var.ad
1 change: 1 addition & 0 deletions autoscaling/tf_init/compute-cluster.tf
@@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" {
#Optional
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
12 changes: 8 additions & 4 deletions autoscaling/tf_init/controller_update.tf
@@ -1,6 +1,6 @@

locals {
controller_path = "${var.autoscaling_folder}/clusters/${var.cluster_name}"
controller_path = "${var.autoscaling_folder}/clusters/${local.cluster_name}"
}

resource "null_resource" "create_path" {
@@ -12,7 +12,7 @@ resource "null_resource" "create_path" {
resource "local_file" "hosts" {
depends_on = [null_resource.create_path,oci_core_cluster_network.cluster_network]
content = join("\n", local.cluster_instances_ips)
filename = "${local.controller_path}/hosts_${var.cluster_name}"
filename = "${local.controller_path}/hosts_${local.cluster_name}"
}

resource "local_file" "inventory" {
@@ -24,6 +24,8 @@ resource "local_file" "inventory" {
backup_ip = var.backup_ip,
login_name = var.login_name,
login_ip = var.login_ip,
monitoring_name = var.monitoring_name,
monitoring_ip = var.monitoring_ip,
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = var.public_subnet,
private_subnet = var.private_subnet,
@@ -66,7 +68,7 @@ resource "local_file" "inventory" {
instance_pool_ocpus=local.instance_pool_ocpus,
queue=var.queue,
instance_type=var.instance_type,
monitoring=var.monitoring,
cluster_monitoring=var.cluster_monitoring,
autoscaling_monitoring = var.autoscaling_monitoring,
unsupported = var.unsupported,
hyperthreading = var.hyperthreading,
@@ -78,7 +80,9 @@
pam = var.pam,
sacct_limits = var.sacct_limits,
use_compute_agent=var.use_compute_agent,
healthchecks=var.healthchecks
healthchecks=var.healthchecks,
change_hostname=var.change_hostname,
hostname_convention=var.hostname_convention
})
filename = "${local.controller_path}/inventory"
}
2 changes: 2 additions & 0 deletions autoscaling/tf_init/instance-pool.tf
@@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" {
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configurations {
availability_domain = var.ad
8 changes: 6 additions & 2 deletions autoscaling/tf_init/inventory.tpl
@@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif }
[login]
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
[monitoring]
%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif }
[compute_to_add]
[compute_configured]
%{ for host, ip in compute ~}
@@ -62,7 +64,7 @@ log_vol=${log_vol}
ldap=${ldap}
queue=${queue}
instance_type=${instance_type}
monitoring=${monitoring}
cluster_monitoring=${cluster_monitoring}
hyperthreading=${hyperthreading}
privilege_sudo=${privilege_sudo}
privilege_group_name=${privilege_group_name}
@@ -74,4 +76,6 @@ sacct_limits=${sacct_limits}
use_compute_agent=${use_compute_agent}
zone_name=${zone_name}
dns_entries=${dns_entries}
healthchecks=${healthchecks}
healthchecks=${healthchecks}
change_hostname=${change_hostname}
hostname_convention=${hostname_convention}
2 changes: 1 addition & 1 deletion autoscaling/tf_init/locals.tf
@@ -38,6 +38,6 @@ locals {

timeout_per_batch= var.cluster_network ? var.use_multiple_ads ? 15 : 30 : var.use_multiple_ads ? 6 : 15
timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"])
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.H100.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"

}
4 changes: 2 additions & 2 deletions autoscaling/tf_init/network.tf
@@ -183,10 +183,10 @@ resource "oci_dns_rrset" "rrset-cluster-network-SLURM" {

for_each = var.slurm && var.dns_entries ? toset([for v in range(var.node_count) : tostring(v)]) : []
zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
rtype = "A"
items {
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
rtype = "A"
rdata = "${local.cluster_instances_ips[tonumber(each.key)]}"
ttl = 3600
4 changes: 2 additions & 2 deletions autoscaling/tf_init/versions.tf
@@ -1,9 +1,9 @@
terraform {
required_version = ">= 1.0"
required_version = ">= 1.2"
required_providers {
oci = {
source = "oracle/oci"
version = "5.37.0"
version = "6.9.0"
}
}
}