Commit

Merge pull request #44 from oracle-quickstart/2.11.0
2.11.0 Release
arnaudfroidmont authored Oct 16, 2024
2 parents 382c496 + e5ee24e commit 6c9652e
Showing 158 changed files with 45,035 additions and 2,601 deletions.
25 changes: 15 additions & 10 deletions README.md
@@ -23,6 +23,7 @@ Allow dynamic-group instance_principal to manage compute-management-family in compartment compartmentName
Allow dynamic-group instance_principal to manage instance-family in compartment compartmentName
Allow dynamic-group instance_principal to use virtual-network-family in compartment compartmentName
Allow dynamic-group instance_principal to use volumes in compartment compartmentName
Allow dynamic-group instance_principal to manage dns in compartment compartmentName
```
or:

@@ -34,12 +35,9 @@ The stack allows various combinations of OS. Here is a list of what has been tested:

| Controller | Compute |
|---------------|--------------|
| OL7 | OL7 |
| OL7 | OL8 |
| OL7 | CentOS7 |
| OL8 | OL8 |
| OL8 | OL7 |
| Ubuntu 20.04 | Ubuntu 20.04 |
| OL8 | OL8 |
| OL8 | OL7 |
| Ubuntu 22.04 | Ubuntu 22.04 |

When switching to Ubuntu, make sure the username is changed from opc to ubuntu in the ORM for both the controller and compute nodes.
## How is resizing different from autoscaling?
@@ -276,10 +274,6 @@ Example:
```
/opt/oci-hpc/bin/create_cluster.sh 4 compute2-1-hpc HPC_instance compute2
```
The name of the cluster must follow the pattern:
queueName-clusterNumber-instanceType_keyword

The keyword must match an instance type keyword from /opt/oci-hpc/conf/queues.conf for the cluster to be registered in Slurm, as illustrated below.
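
For illustration only, here is a hypothetical breakdown of the example name used above (the queue and keyword values are just those from the example; your own queues.conf defines the valid ones):
```
# Hypothetical breakdown of the example cluster name "compute2-1-hpc":
#   compute2 -> queue name (a queue defined in /opt/oci-hpc/conf/queues.conf)
#   1        -> cluster number
#   hpc      -> instance type keyword (must match the keyword of an instance type in queues.conf)
```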

### Cluster Deletion:
```
@@ -422,3 +416,14 @@ By default, this check box is enabled. By selecting this check box, a PAR would
Step 2: Use the shell script upload_rdma_nic_metrics.sh to collect metrics and upload them to object storage.
Run upload_rdma_nic_metrics.sh to collect the metrics and upload them to object storage. The metrics collection limit and interval can be configured through the config file rdma_metrics_collection_config.conf; a sketch of the workflow follows.
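
A minimal sketch of that workflow is shown below; the exact locations of the script and config file are assumptions, so adjust the paths to wherever they are installed on your controller:
```
# Sketch only: paths are assumptions, not confirmed by this README.
# 1. Set the metrics collection limit and interval in the config file.
vi rdma_metrics_collection_config.conf
# 2. Run the collection/upload script (it reads the config file above).
./upload_rdma_nic_metrics.sh
```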

## Meshpinger

Meshpinger is a tool for validating network-layer connectivity between RDMA NICs on a cluster network in OCI. The tool initiates an ICMP ping from every RDMA NIC port on the cluster network to every other RDMA NIC port on the same cluster network and reports the success/failure status of the pings in the form of logs.

Running the tool before starting a workload on a cluster network serves as a good precheck to gain confidence in the network reachability between RDMA NICs (a minimal illustrative sketch follows the list below). Typical causes of reachability failures that the tool can help pinpoint are:
1. Link down on the RDMA NIC
2. RDMA interface initialization or configuration issues, including IP address assignment to the interface
3. Insufficient ARP table size on the node to store all needed peer MAC addresses
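
For illustration only, here is a minimal sketch of the mesh-ping idea. This is not the Meshpinger implementation; the interface names and peer IPs are placeholder assumptions:
```
# Illustrative sketch only -- not the actual Meshpinger tool.
# Ping every peer RDMA IP from every local RDMA interface and report the result.
RDMA_IFACES="rdma0 rdma1"                # assumption: local RDMA interface names
PEER_IPS="10.224.0.11 10.224.0.12"       # assumption: peer RDMA NIC IPs on the cluster network

for iface in $RDMA_IFACES; do
  for ip in $PEER_IPS; do
    # -I binds the probe to a source interface, -c 1 sends one ping, -W 2 sets a 2-second timeout
    if ping -I "$iface" -c 1 -W 2 "$ip" > /dev/null 2>&1; then
      echo "$iface -> $ip: OK"
    else
      echo "$iface -> $ip: FAIL"
    fi
  done
done
```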
93 changes: 51 additions & 42 deletions autoscaling/crontab/autoscale_slurm.sh
@@ -111,12 +111,12 @@ def getDefaultsConfig(config,queue_name):
for instance_type in partition["instance_types"]:
if "default" in instance_type.keys():
if instance_type["default"]:
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
if len(partition["instance_types"])>0:
instance_type=partition["instance_types"][0]
print ("No default configuration was found, there may be a problem in your queues.conf file")
print ("Selecting "+instance_type["name"]+" as default")
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
print ("The queue "+queue_name+" was not found in the queues.conf file")
return None

@@ -125,7 +125,7 @@ def getJobConfig(config,queue_name,instance_type_name):
if queue_name == partition["name"]:
for instance_type in partition["instance_types"]:
if instance_type_name == instance_type["name"]:
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
return None

def getQueueLimits(config,queue_name,instance_type_name):
@@ -136,11 +136,11 @@ def getQueueLimits(config,queue_name,instance_type_name):
return {"max_number_nodes": int(instance_type["max_number_nodes"]), "max_cluster_size": int(instance_type["max_cluster_size"]),"max_cluster_count": int(instance_type["max_cluster_count"])}
return {"max_number_nodes": 0, "max_cluster_size": 0,"max_cluster_count": 0}

def getInstanceType(config,queue_name,instance_keyword):
def getInstanceType(config,queue_name,hostname_convention):
for partition in config:
if queue_name == partition["name"]:
for instance_type in partition["instance_types"]:
if instance_keyword == instance_type["instance_keyword"]:
if hostname_convention == instance_type["hostname_convention"]:
return instance_type["name"]
return None

@@ -161,26 +161,33 @@ def getAllClusterNames(config):
return availableNames

def getClusterName(node):
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
stdout,stderr = out.communicate()
clusterName = None
try:
if len(stdout.split('\n')) > 2:
for output in stdout.split('\n')[:-1]:
if "Switches=" in output:
clusterName=output.split()[0].split('SwitchName=')[1]
break
elif "SwitchName=inactive-" in output:
continue
else:
clusterName=output.split()[0].split('SwitchName=')[1]
elif len(stdout.split('\n')) == 2:
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
if clusterName.startswith("inactive-"):
details=getNodeDetails(node)
clusterName="NOCLUSTERFOUND"
for feature in details[0].split(","):
if feature.startswith('CN__'):
clusterName=feature[4:]
if clusterName == "NOCLUSTERFOUND":
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
stdout,stderr = out.communicate()
clusterName = None
try:
if len(stdout.split('\n')) > 2:
for output in stdout.split('\n')[:-1]:
if "Switches=" in output:
clusterName=output.split()[0].split('SwitchName=')[1]
break
elif "SwitchName=inactive-" in output:
continue
else:
clusterName=output.split()[0].split('SwitchName=')[1]
elif len(stdout.split('\n')) == 2:
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
if clusterName.startswith("inactive-"):
return "NOCLUSTERFOUND"
except:
print('No ClusterName could be found for '+node)
print('There seems to be some issues in the slurm topology file')
return "NOCLUSTERFOUND"
except:
print('No ClusterName could be found for '+node)
return "NOCLUSTERFOUND"
return clusterName

def getstatus_slurm():
@@ -246,7 +253,7 @@ def getstatus_slurm():
clustername=getClusterName(node)
if clustername is None:
continue
instanceType=features[-1]
instanceType=features[0]
if queue in current_nodes.keys():
if instanceType in current_nodes[queue].keys():
current_nodes[queue][instanceType]+=1
@@ -276,7 +283,9 @@ def getstatus_slurm():
cluster_to_destroy=[]
for clustername in nodes_to_destroy_temp.keys():
destroyEntireCluster=True
if clustername in running_cluster or clustername == "NOCLUSTERFOUND":
if clustername == "NOCLUSTERFOUND":
destroyEntireCluster=False
elif clustername in running_cluster:
nodes_to_destroy[clustername]=nodes_to_destroy_temp[clustername]
destroyEntireCluster=False
else:
@@ -295,10 +304,10 @@ def getstatus_slurm():
for clusterName in os.listdir(clusters_path):
if len(clusterName.split('-')) < 3:
continue
instance_keyword='-'.join(clusterName.split('-')[2:])
hostname_convention='-'.join(clusterName.split('-')[2:])
clusterNumber=int(clusterName.split('-')[1])
queue=clusterName.split('-')[0]
instanceType=getInstanceType(config,queue,instance_keyword)
instanceType=getInstanceType(config,queue,hostname_convention)
if not queue in used_index.keys():
used_index[queue]={}
if not instanceType in used_index[queue].keys():
@@ -311,19 +320,19 @@ def getstatus_slurm():
nodes = line.split()[0]
instance_type = line.split()[1]
queue = line.split()[2]
try:
cluster_building.append([int(nodes),instance_type,queue])
if queue in building_nodes.keys():
if instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]+=int(nodes)
try:
cluster_building.append([int(nodes),instance_type,queue])
if queue in building_nodes.keys():
if instance_type in building_nodes[queue].keys():
building_nodes[queue][instance_type]+=int(nodes)
else:
building_nodes[queue][instance_type]=int(nodes)
else:
building_nodes[queue][instance_type]=int(nodes)
else:
building_nodes[queue]={instance_type:int(nodes)}
except ValueError:
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
print ('Ignoring')
continue
building_nodes[queue]={instance_type:int(nodes)}
except ValueError:
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
print ('Ignoring')
continue
if os.path.isfile(os.path.join(clusters_path,clusterName,'currently_destroying')):
cluster_destroying.append(clusterName)
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
@@ -422,7 +431,7 @@ if autoscaling == "true":
nextIndex=i
used_index[queue][instance_type].append(i)
break
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["hostname_convention"]
if not queue in current_nodes.keys():
current_nodes[queue]={instance_type:0}
else:
@@ -448,5 +457,5 @@ if autoscaling == "true":
traceback.print_exc()
os.remove(lockfile)
else:
print("Autoscaling is false")
print("Autoscaling is false (set in /etc/ansible/hosts)")
exit()
2 changes: 2 additions & 0 deletions autoscaling/tf_init/cluster-network.tf
@@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" {
}
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configuration {
availability_domain = var.ad
1 change: 1 addition & 0 deletions autoscaling/tf_init/compute-cluster.tf
@@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" {
#Optional
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
12 changes: 8 additions & 4 deletions autoscaling/tf_init/controller_update.tf
@@ -1,6 +1,6 @@

locals {
controller_path = "${var.autoscaling_folder}/clusters/${var.cluster_name}"
controller_path = "${var.autoscaling_folder}/clusters/${local.cluster_name}"
}

resource "null_resource" "create_path" {
@@ -12,7 +12,7 @@ resource "null_resource" "create_path" {
resource "local_file" "hosts" {
depends_on = [null_resource.create_path,oci_core_cluster_network.cluster_network]
content = join("\n", local.cluster_instances_ips)
filename = "${local.controller_path}/hosts_${var.cluster_name}"
filename = "${local.controller_path}/hosts_${local.cluster_name}"
}

resource "local_file" "inventory" {
@@ -24,6 +24,8 @@ resource "local_file" "inventory" {
backup_ip = var.backup_ip,
login_name = var.login_name,
login_ip = var.login_ip,
monitoring_name = var.monitoring_name,
monitoring_ip = var.monitoring_ip,
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
public_subnet = var.public_subnet,
private_subnet = var.private_subnet,
@@ -66,7 +68,7 @@ resource "local_file" "inventory" {
instance_pool_ocpus=local.instance_pool_ocpus,
queue=var.queue,
instance_type=var.instance_type,
monitoring=var.monitoring,
cluster_monitoring=var.cluster_monitoring,
autoscaling_monitoring = var.autoscaling_monitoring,
unsupported = var.unsupported,
hyperthreading = var.hyperthreading,
@@ -78,7 +80,9 @@
pam = var.pam,
sacct_limits = var.sacct_limits,
use_compute_agent=var.use_compute_agent,
healthchecks=var.healthchecks
healthchecks=var.healthchecks,
change_hostname=var.change_hostname,
hostname_convention=var.hostname_convention
})
filename = "${local.controller_path}/inventory"
}
2 changes: 2 additions & 0 deletions autoscaling/tf_init/instance-pool.tf
@@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" {
display_name = local.cluster_name
freeform_tags = {
"user" = var.tags
"cluster_name" = local.cluster_name
"parent_cluster" = local.cluster_name
}
placement_configurations {
availability_domain = var.ad
8 changes: 6 additions & 2 deletions autoscaling/tf_init/inventory.tpl
@@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif }
[login]
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
[monitoring]
%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif }
[compute_to_add]
[compute_configured]
%{ for host, ip in compute ~}
@@ -62,7 +64,7 @@ log_vol=${log_vol}
ldap=${ldap}
queue=${queue}
instance_type=${instance_type}
monitoring=${monitoring}
cluster_monitoring=${cluster_monitoring}
hyperthreading=${hyperthreading}
privilege_sudo=${privilege_sudo}
privilege_group_name=${privilege_group_name}
@@ -74,4 +76,6 @@ sacct_limits=${sacct_limits}
use_compute_agent=${use_compute_agent}
zone_name=${zone_name}
dns_entries=${dns_entries}
healthchecks=${healthchecks}
healthchecks=${healthchecks}
change_hostname=${change_hostname}
hostname_convention=${hostname_convention}
2 changes: 1 addition & 1 deletion autoscaling/tf_init/locals.tf
@@ -38,6 +38,6 @@ locals {

timeout_per_batch= var.cluster_network ? var.use_multiple_ads ? 15 : 30 : var.use_multiple_ads ? 6 : 15
timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"])
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.H100.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"

}
4 changes: 2 additions & 2 deletions autoscaling/tf_init/network.tf
@@ -183,10 +183,10 @@ resource "oci_dns_rrset" "rrset-cluster-network-SLURM" {

for_each = var.slurm && var.dns_entries ? toset([for v in range(var.node_count) : tostring(v)]) : []
zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
rtype = "A"
items {
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
rtype = "A"
rdata = "${local.cluster_instances_ips[tonumber(each.key)]}"
ttl = 3600
4 changes: 2 additions & 2 deletions autoscaling/tf_init/versions.tf
@@ -1,9 +1,9 @@
terraform {
required_version = ">= 1.0"
required_version = ">= 1.2"
required_providers {
oci = {
source = "oracle/oci"
version = "5.37.0"
version = "6.9.0"
}
}
}