Merge pull request #39 from oracle-quickstart/2.10.6
2.10.6
arnaudfroidmont authored May 17, 2024
2 parents 3c6f243 + a89a1b2 commit 382c496
Showing 42 changed files with 1,566 additions and 276 deletions.
23 changes: 17 additions & 6 deletions README.md
@@ -104,12 +104,12 @@ optional arguments:
--ansible_crucial If present during reconfiguration, only crucial
ansible playbooks will be executed on the live nodes.
Non live nodes will be removed
--remove_unreachable If present, nodes that are not sshable will be removed
from the config. They will however not be removed from
Slurm to avoid losing track of the down nodes. If you
need to remove them from Slurm after terminating the
                        nodes in the console, run sudo scontrol update
nodename=name state=Future
--remove_unreachable If present, nodes that are not sshable will be terminated
before running the action that was requested
(Example Adding a node)
--quiet If present, the script will not prompt for a response when
removing nodes and will not give a reminder to save data
from nodes that are being removed
```
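
For example, the new flags can be combined in a single call (an illustrative invocation reusing the sample cluster name from the examples below; adjust it to your own cluster):
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --remove_unreachable --quiet
```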

**Add nodes**
@@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc:
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc
```
or, to remove 3 nodes randomly from compute-1-hpc without prompting for a response and without giving a reminder to save data from the nodes being removed:
```
/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet
```

**Reconfigure nodes**
@@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`:
```
* * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1
```
And in `/etc/ansible/hosts`, the following value should be set to true:
```
autoscaling = true
```
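
A quick way to confirm the current setting (a minimal illustrative check; it mirrors how `autoscale_slurm.sh` greps this value out of the inventory file):
```
grep 'autoscaling =' /etc/ansible/hosts
```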

# Submit
How to submit jobs:
224 changes: 120 additions & 104 deletions autoscaling/crontab/autoscale_slurm.sh
@@ -91,8 +91,8 @@ def getIdleTime(node):
return ( datetime.datetime.now() - right_time ).total_seconds()

# Read the queue configuration from the queues YAML file
def getQueueConf(file):
with open(queues_conf_file) as file:
def getQueueConf(queue_file):
with open(queue_file) as file:
try:
data = yaml.load(file,Loader=yaml.FullLoader)
except:
@@ -328,109 +328,125 @@ def getstatus_slurm():
cluster_destroying.append(clusterName)
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes

if os.path.isfile(lockfile):
print( "Lockfile "+lockfile + " is present, exiting" )
exit()
open(lockfile,'w').close()
try:
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
clusters_path = os.path.join(path,'clusters')
config = getQueueConf(queues_conf_file)

cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()

print (time.strftime("%Y-%m-%d %H:%M:%S"))
print (cluster_to_build,'cluster_to_build')
print (cluster_to_destroy,'cluster_to_destroy')
print (nodes_to_destroy,'nodes_to_destroy')
print (cluster_building,'cluster_building')
print (cluster_destroying,'cluster_destroying')
print (current_nodes,'current_nodes')
print (building_nodes,'building_nodes')

for i in cluster_building:
for j in cluster_to_build:
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
cluster_to_build.remove(j)
break
for cluster in cluster_to_destroy:
cluster_name=cluster[0]
print ("Deleting cluster "+cluster_name)
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
time.sleep(5)

for cluster_name in nodes_to_destroy.keys():
print ("Resizing cluster "+cluster_name)
initial_nodes=[]
unreachable_nodes=[]
if cluster_name == "NOCLUSTERFOUND":
subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name])
continue
for node in nodes_to_destroy[cluster_name]:
def getAutoscaling():
out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True)
stdout,stderr = out.communicate()
output = stdout.split("\n")
autoscaling_value=False
for i in range(0,len(output)-1):
autoscaling_value=output[i]
return autoscaling_value

autoscaling = getAutoscaling()

if autoscaling == "true":

if os.path.isfile(lockfile):
print( "Lockfile "+lockfile + " is present, exiting" )
exit()
open(lockfile,'w').close()
try:
path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0])))
clusters_path = os.path.join(path,'clusters')
config = getQueueConf(queues_conf_file)

cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm()

print (time.strftime("%Y-%m-%d %H:%M:%S"))
print (cluster_to_build,'cluster_to_build')
print (cluster_to_destroy,'cluster_to_destroy')
print (nodes_to_destroy,'nodes_to_destroy')
print (cluster_building,'cluster_building')
print (cluster_destroying,'cluster_destroying')
print (current_nodes,'current_nodes')
print (building_nodes,'building_nodes')

for i in cluster_building:
for j in cluster_to_build:
if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]:
cluster_to_build.remove(j)
break
for cluster in cluster_to_destroy:
cluster_name=cluster[0]
print ("Deleting cluster "+cluster_name)
subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
time.sleep(5)

for cluster_name in nodes_to_destroy.keys():
print ("Resizing cluster "+cluster_name)
initial_nodes=[]
unreachable_nodes=[]
if cluster_name == "NOCLUSTERFOUND":
subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]+['--quiet'])
continue
for node in nodes_to_destroy[cluster_name]:
try:
alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
for alt_name in alt_names.split("\n")[0].split():
if alt_name.startswith('inst-'):
initial_nodes.append(alt_name)
break
except:
unreachable_nodes.append(node)
if len(initial_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes+['--quiet'])
if len(unreachable_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes+['--quiet'])
time.sleep(1)

for index,cluster in enumerate(cluster_to_build):
nodes=cluster[0]
instance_type = cluster[1]
queue=cluster[2]
jobID=str(cluster[3])
user=str(cluster[4])
jobconfig=getJobConfig(config,queue,instance_type)
limits=getQueueLimits(config,queue,instance_type)
try:
alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True)
for alt_name in alt_names.split("\n")[0].split():
if alt_name.startswith('inst-'):
initial_nodes.append(alt_name)
break
clusterCount=len(used_index[queue][instance_type])
except:
unreachable_nodes.append(node)
if len(initial_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
if len(unreachable_nodes) > 0:
subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)
time.sleep(1)

for index,cluster in enumerate(cluster_to_build):
nodes=cluster[0]
instance_type = cluster[1]
queue=cluster[2]
jobID=str(cluster[3])
user=str(cluster[4])
jobconfig=getJobConfig(config,queue,instance_type)
limits=getQueueLimits(config,queue,instance_type)
try:
clusterCount=len(used_index[queue][instance_type])
except:
clusterCount=0
if clusterCount>=limits["max_cluster_count"]:
print ("This would go over the number of running clusters, you have reached the max number of clusters")
continue
nextIndex=None
if clusterCount==0:
if queue in used_index.keys():
used_index[queue][instance_type]=[1]
clusterCount=0
if clusterCount>=limits["max_cluster_count"]:
print ("This would go over the number of running clusters, you have reached the max number of clusters")
continue
nextIndex=None
if clusterCount==0:
if queue in used_index.keys():
used_index[queue][instance_type]=[1]
else:
used_index[queue]={instance_type:[1]}
nextIndex=1
else:
used_index[queue]={instance_type:[1]}
nextIndex=1
else:
for i in range(1,10000):
if not i in used_index[queue][instance_type]:
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
if not instance_type in current_nodes[queue].keys():
current_nodes[queue][instance_type]=0
if not queue in building_nodes.keys():
building_nodes[queue]={instance_type:0}
else:
if not instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]=0
if nodes > limits["max_cluster_size"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
else:
current_nodes[queue][instance_type]+=nodes
clusterCount+=1
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
time.sleep(5)
for i in range(1,10000):
if not i in used_index[queue][instance_type]:
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
if not instance_type in current_nodes[queue].keys():
current_nodes[queue][instance_type]=0
if not queue in building_nodes.keys():
building_nodes[queue]={instance_type:0}
else:
if not instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]=0
if nodes > limits["max_cluster_size"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit")
elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]:
print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit")
else:
current_nodes[queue][instance_type]+=nodes
clusterCount+=1
print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes")
subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user])
time.sleep(5)

except Exception:
traceback.print_exc()
os.remove(lockfile)
except Exception:
traceback.print_exc()
os.remove(lockfile)
else:
print("Autoscaling is false")
exit()
8 changes: 7 additions & 1 deletion autoscaling/tf_init/cluster-network-configuration.tf
@@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati
name = "Compute HPC RDMA Auto-Configuration"
desired_state = plugins_config.value
}

}
dynamic plugins_config {
for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute RDMA GPU Monitoring"
desired_state = plugins_config.value
}
}
}
dynamic "platform_config" {
3 changes: 2 additions & 1 deletion autoscaling/tf_init/controller_update.tf
@@ -77,7 +77,8 @@ resource "local_file" "inventory" {
compute_username = var.compute_username,
pam = var.pam,
sacct_limits = var.sacct_limits,
use_compute_agent=var.use_compute_agent
use_compute_agent=var.use_compute_agent,
healthchecks=var.healthchecks
})
filename = "${local.controller_path}/inventory"
}
17 changes: 16 additions & 1 deletion autoscaling/tf_init/instance-pool-configuration.tf
@@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
user_data = base64encode(data.template_file.config.rendered)
}
agent_config {
is_management_disabled = true

are_all_plugins_disabled = false
is_management_disabled = true
is_monitoring_disabled = false

plugins_config {
desired_state = "DISABLED"
name = "OS Management Service Agent"
}
dynamic plugins_config {
for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"]
content {
name = "Compute RDMA GPU Monitoring"
desired_state = plugins_config.value
}
}
}
shape = var.instance_pool_shape

dynamic "shape_config" {
3 changes: 2 additions & 1 deletion autoscaling/tf_init/inventory.tpl
@@ -73,4 +73,5 @@ pam = ${pam}
sacct_limits=${sacct_limits}
use_compute_agent=${use_compute_agent}
zone_name=${zone_name}
dns_entries=${dns_entries}
dns_entries=${dns_entries}
healthchecks=${healthchecks}
2 changes: 1 addition & 1 deletion autoscaling/tf_init/versions.tf
@@ -3,7 +3,7 @@ terraform {
required_providers {
oci = {
source = "oracle/oci"
version = "5.30.0"
version = "5.37.0"
}
}
}
2 changes: 1 addition & 1 deletion bin/delete_cluster.sh
@@ -103,7 +103,7 @@ else
for node in `scontrol show hostname $nodes 2>&1`
do
echo "Cleaning up node " $node
/opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node
/opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet
done
fi
cd
4 changes: 4 additions & 0 deletions bin/remove_nodes_prompt.txt
@@ -0,0 +1,4 @@
Does your cluster run any file system, such as Ceph or NFS, on the GPU/HPC nodes themselves using local NVMe SSDs?
If yes, terminating nodes that store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster, such as object storage or file storage.
Once the data is backed up or migrated, come back and run the script. Select 2 to exit.
Remember, once the nodes are terminated, all the data is lost forever and you won't be able to recover it.
7 changes: 6 additions & 1 deletion bin/resize.py
@@ -447,7 +447,10 @@ def getNFSnode(inventory):
return ''
if len(dict['nfs']) == 0:
return ''
return dict['nfs'][0].split()[0]
if dict['nfs'][0] == '\n':
return ''
else:
return dict['nfs'][0].split()[0]

def get_summary(comp_ocid,cluster_name):
CN = "CN"
@@ -577,6 +580,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False)
parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed',action='store_true',default=False)
parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False)
parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False)

args = parser.parse_args()

@@ -753,6 +757,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index
if len(unreachable_instances):
if not remove_unreachable:
print("STDOUT: At least one unreachable node is in the inventory")
print(unreachable_instances)
print("STDOUT: Not doing anything")
exit(1)
else:
