Merge pull request #39 from oracle-quickstart/2.10.6
2.10.6
arnaudfroidmont authored May 17, 2024
2 parents 3c6f243 + a89a1b2 commit 382c496
Showing 42 changed files with 1,566 additions and 276 deletions.
23 changes: 17 additions & 6 deletions README.md
@@ -104,12 +104,12 @@ optional arguments:
--ansible_crucial If present during reconfiguration, only crucial
ansible playbooks will be executed on the live nodes.
Non live nodes will be removed
--remove_unreachable If present, nodes that are not sshable will be removed
from the config. They will however not be removed from
Slurm to avoid losing track of the down nodes. If you
need to remove them from Slurm after terminating the
                        nodes in the console, run sudo scontrol update
nodename=name state=Future
--remove_unreachable If present, nodes that are not sshable will be terminated
before running the action that was requested
(Example Adding a node)
--quiet If present, the script will not prompt for a response when
removing nodes and will not give a reminder to save data
from nodes that are being removed
```
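
For example, the new flags can be combined in a single call (an illustrative invocation reusing the sample cluster name from the examples below; adjust it to your own cluster):
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --remove_unreachable --quiet
```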

**Add nodes**
@@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc:
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
```
or, to remove 3 nodes randomly from compute-1-hpc without prompting for a response and without giving a reminder to save data from the nodes being removed:
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
```

**Reconfigure nodes**
@@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`:
```
* * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1
```
And in `/etc/ansible/hosts`, the following value should be set to true:
```
autoscaling = true
```
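
A quick way to confirm the current setting (a minimal illustrative check; it mirrors how `autoscale_slurm.sh` greps this value out of the inventory file):
```
grep 'autoscaling =' /etc/ansible/hosts
```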

# Submit
How to submit jobs:
224 changes: 120 additions & 104 deletions autoscaling/crontab/autoscale_slurm.sh
@@ -91,8 +91,8 @@ def getIdleTime(node):
return ( datetime.datetime.now() - right_time ).total_seconds()

# Read the queue configuration from the queues YAML file
def getQueueConf(file):
with open(queues_conf_file) as file:
def getQueueConf(queue_file):
with open(queue_file) as file:
try:
data = yaml.load(file,Loader=yaml.FullLoader)
except:
@@ -328,109 +328,125 @@ def getstatus_slurm():
cluster_destroying.append(clusterName)
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes

if os.path.isfile(lockfile):
print( "Lockfile "+lockfile + " is present, exiting" )
exit()
open(lockfile,'w').close()
try:
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
clusters_path = os.path.join(path,'clusters')
config = getQueueConf(queues_conf_file)

cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()

print (time.strftime("%Y-%m-%d %H:%M:%S"))
print (cluster_to_build,'cluster_to_build')
print (cluster_to_destroy,'cluster_to_destroy')
print (nodes_to_destroy,'nodes_to_destroy')
print (cluster_building,'cluster_building')
print (cluster_destroying,'cluster_destroying')
print (current_nodes,'current_nodes')
print (building_nodes,'building_nodes')

for i in cluster_building:
for j in cluster_to_build:
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
cluster_to_build.remove(j)
break
for cluster in cluster_to_destroy:
cluster_name=cluster[0]
print ("Deleting cluster "+cluster_name)
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
time.sleep(5)

for cluster_name in nodes_to_destroy.keys():
print ("Resizing cluster "+cluster_name)
initial_nodes=[]
unreachable_nodes=[]
if cluster_name == "NOCLUSTERFOUND":
subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name])
continue
for node in nodes_to_destroy[cluster_name]:
def getAutoscaling():
out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True)
stdout,stderr = out.communicate()
output = stdout.split("\n")
autoscaling_value=False
for i in range(0,len(output)-1):
autoscaling_value=output[i]
return autoscaling_value

autoscaling = getAutoscaling()

if autoscaling == "true":

if os.path.isfile(lockfile):
print( "Lockfile "+lockfile + " is present, exiting" )
exit()
open(lockfile,'w').close()
try:
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
clusters_path = os.path.join(path,'clusters')
config = getQueueConf(queues_conf_file)

cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()

print (time.strftime("%Y-%m-%d %H:%M:%S"))
print (cluster_to_build,'cluster_to_build')
print (cluster_to_destroy,'cluster_to_destroy')
print (nodes_to_destroy,'nodes_to_destroy')
print (cluster_building,'cluster_building')
print (cluster_destroying,'cluster_destroying')
print (current_nodes,'current_nodes')
print (building_nodes,'building_nodes')

for i in cluster_building:
for j in cluster_to_build:
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
cluster_to_build.remove(j)
break
for cluster in cluster_to_destroy:
cluster_name=cluster[0]
print ("Deleting cluster "+cluster_name)
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
time.sleep(5)

for cluster_name in nodes_to_destroy.keys():
print ("Resizing cluster "+cluster_name)
initial_nodes=[]
unreachable_nodes=[]
if cluster_name == "NOCLUSTERFOUND":
subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]+['--quiet'])
continue
for node in nodes_to_destroy[cluster_name]:
try:
alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
for alt_name in alt_names.split("\n")[0].split():
if alt_name.startswith('inst-'):
initial_nodes.append(alt_name)
break
except:
unreachable_nodes.append(node)
if len(initial_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes+['--quiet'])
if len(unreachable_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes+['--quiet'])
time.sleep(1)

for index,cluster in enumerate(cluster_to_build):
nodes=cluster[0]
instance_type = cluster[1]
queue=cluster[2]
jobID=str(cluster[3])
user=str(cluster[4])
jobconfig=getJobConfig(config,queue,instance_type)
limits=getQueueLimits(config,queue,instance_type)
try:
alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
for alt_name in alt_names.split("\n")[0].split():
if alt_name.startswith('inst-'):
initial_nodes.append(alt_name)
break
clusterCount=len(used_index[queue][instance_type])
except:
unreachable_nodes.append(node)
if len(initial_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
if len(unreachable_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)
time.sleep(1)

for index,cluster in enumerate(cluster_to_build):
nodes=cluster[0]
instance_type = cluster[1]
queue=cluster[2]
jobID=str(cluster[3])
user=str(cluster[4])
jobconfig=getJobConfig(config,queue,instance_type)
limits=getQueueLimits(config,queue,instance_type)
try:
clusterCount=len(used_index[queue][instance_type])
except:
clusterCount=0
if clusterCount>=limits["max_cluster_count"]:
print ("This would go over the number of running clusters, you have reached the max number of clusters")
continue
nextIndex=None
if clusterCount==0:
if queue in used_index.keys():
used_index[queue][instance_type]=[1]
clusterCount=0
if clusterCount>=limits["max_cluster_count"]:
print ("This would go over the number of running clusters, you have reached the max number of clusters")
continue
nextIndex=None
if clusterCount==0:
if queue in used_index.keys():
used_index[queue][instance_type]=[1]
else:
used_index[queue]={instance_type:[1]}
nextIndex=1
else:
used_index[queue]={instance_type:[1]}
nextIndex=1
else:
for i in range(1,10000):
if not i in used_index[queue][instance_type]:
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
if not instance_type in current_nodes[queue].keys():
current_nodes[queue][instance_type]=0
if not queue in building_nodes.keys():
building_nodes[queue]={instance_type:0}
else:
if not instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]=0
if nodes > limits["max_cluster_size"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
else:
current_nodes[queue][instance_type]+=nodes
clusterCount+=1
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
time.sleep(5)
for i in range(1,10000):
if not i in used_index[queue][instance_type]:
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
if not instance_type in current_nodes[queue].keys():
current_nodes[queue][instance_type]=0
if not queue in building_nodes.keys():
building_nodes[queue]={instance_type:0}
else:
if not instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]=0
if nodes > limits["max_cluster_size"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
else:
current_nodes[queue][instance_type]+=nodes
clusterCount+=1
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
time.sleep(5)

except Exception:
traceback.print_exc()
os.remove(lockfile)
except Exception:
traceback.print_exc()
os.remove(lockfile)
else:
print("Autoscaling is false")
exit()
8 changes: 7 additions & 1 deletion autoscaling/tf_init/cluster-network-configuration.tf
@@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
name = "Compute HPC RDMA Auto-Configuration"
desired_state = plugins_config.value
}

}
dynamic plugins_config {
for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute RDMA GPU Monitoring"
desired_state = plugins_config.value
}
}
}
dynamic "platform_config" {
3 changes: 2 additions & 1 deletion autoscaling/tf_init/controller_update.tf
@@ -77,7 +77,8 @@ resource "local_file" "inventory" {
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits,
use_compute_agent=var.use_compute_agent
use_compute_agent=var.use_compute_agent,
healthchecks=var.healthchecks
})
filename = "${local.controller_path}/inventory"
}
17 changes: 16 additions & 1 deletion autoscaling/tf_init/instance-pool-configuration.tf
@@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
user_data = base64encode(data.template_file.config.rendered)
}
agent_config {
is_management_disabled = true

are_all_plugins_disabled = false
is_management_disabled = true
is_monitoring_disabled = false

plugins_config {
desired_state = "DISABLED"
name = "OS Management Service Agent"
}
dynamic plugins_config {
for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute RDMA GPU Monitoring"
desired_state = plugins_config.value
}
}
}
shape = var.instance_pool_shape

dynamic "shape_config" {
3 changes: 2 additions & 1 deletion autoscaling/tf_init/inventory.tpl
@@ -73,4 +73,5 @@ pam = ${pam}
sacct_limits=${sacct_limits}
use_compute_agent=${use_compute_agent}
zone_name=${zone_name}
dns_entries=${dns_entries}
dns_entries=${dns_entries}
healthchecks=${healthchecks}
2 changes: 1 addition & 1 deletion autoscaling/tf_init/versions.tf
@@ -3,7 +3,7 @@ terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "5.30.0"
version = "5.37.0"
}
}
}
2 changes: 1 addition & 1 deletion bin/delete_cluster.sh
@@ -103,7 +103,7 @@ else
for node in `scontrol show hostname $nodes 2>&1`
do
echo "Cleaning up node " $node
/opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node
/opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet
done
fi
cd
4 changes: 4 additions & 0 deletions bin/remove_nodes_prompt.txt
@@ -0,0 +1,4 @@
Does your cluster run any file system, such as Ceph or NFS, on the GPU/HPC nodes themselves using local NVMe SSDs?
If yes, terminating nodes that store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster, such as object storage or file storage.
Once the data is backed up or migrated, come back and run the script. Select 2 to exit.
Remember, once the nodes are terminated, all the data is lost forever and you won't be able to recover it.
7 changes: 6 additions & 1 deletion bin/resize.py
@@ -447,7 +447,10 @@ def getNFSnode(inventory):
return ''
if len(dict['nfs']) == 0:
return ''
return dict['nfs'][0].split()[0]
if dict['nfs'][0] == '\n':
return ''
else:
return dict['nfs'][0].split()[0]

def get_summary(comp_ocid,cluster_name):
CN = "CN"
@@ -577,6 +580,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False)
parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed',action='store_true',default=False)
parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False)
parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False)

args = parser.parse_args()

@@ -753,6 +757,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
if len(unreachable_instances):
if not remove_unreachable:
print("STDOUT: At least one unreachable node is in the inventory")
print(unreachable_instances)
print("STDOUT: Not doing anything")
exit(1)
else:
