Workers / Mode: Compute Clusters
Create self-managed HPC Compute Clusters.
A compute cluster is a group of high performance computing (HPC), GPU, or optimized instances that are connected with a high-bandwidth, ultra low-latency network.
- BM.GPU.A100-v2.8
- BM.GPU.H100.8
- BM.GPU4.8
- BM.HPC2.36
- BM.Optimized3.36
Configured with mode = "compute-cluster"
on a worker_pools
entry, or with worker_pool_mode = "compute-cluster"
to set the default mode for all pools unless otherwise specified.
A compute cluster shared by multiple worker groups must be created using the worker_compute_clusters
variable, and is referenced by its key in the compute_cluster
attribute of each worker group.
If worker_compute_clusters
is not specified, the module will create a separate compute cluster for each worker group.
Usage
worker_compute_clusters = { # Use this variable to define a compute cluster you intend to share across multiple node pools.
"shared" = {
placement_ad = 1
}
}
worker_pools = {
oke-vm-standard = {
description = "Managed node pool for operational workloads without GPU toleration"
mode = "node-pool",
size = 1,
shape = "VM.Standard.E4.Flex",
ocpus = 2,
memory = 16,
boot_volume_size = 50,
},
compute-cluster-group-1 = {
shape = "BM.HPC2.36",
boot_volume_size = 100,
image_id = "ocid1.image.oc1..."
image_type = "custom"
mode = "compute-cluster"
compute_cluster = "shared"
instance_ids = ["1", "2", "3"] # List of instance IDs in the compute cluster. Each instance ID corresponds to a separate node in the cluster.
placement_ad = "1"
cloud_init = [
{
content = <<-EOT
#!/usr/bin/env bash
echo "Pool-specific cloud_init using shell script"
EOT
},
],
secondary_vnics = {
"vnic-display-name" = {
nic_index = 1,
subnet_id = "ocid1.subnet..."
},
},
}
compute-cluster-group-2 = {
shape = "BM.HPC2.36",
boot_volume_size = 100,
image_id = "ocid1.image.oc1..."
image_type = "custom"
mode = "compute-cluster"
compute_cluster = "shared"
instance_ids = ["a", "b", "c"] # List of instance IDs in the compute cluster. Each instance ID corresponds to a separate node in the cluster.
placement_ad = "1"
cloud_init = [
{
content = <<-EOT
#!/usr/bin/env bash
echo "Pool-specific cloud_init using shell script"
EOT
},
],
secondary_vnics = {
"vnic-display-name" = {
nic_index = 1,
subnet_id = "ocid1.subnet..."
},
},
}
compute-cluster-group-3 = {
shape = "BM.HPC2.36",
boot_volume_size = 100,
image_id = "ocid1.image.oc1..."
image_type = "custom"
mode = "compute-cluster"
instance_ids = ["001", "002", "003"] # List of instance IDs in the compute cluster. Each instance ID corresponds to a separate node in the cluster.
placement_ad = "1"
cloud_init = [
{
content = <<-EOT
#!/usr/bin/env bash
echo "Pool-specific cloud_init using shell script"
EOT
},
],
}
}
Instance agent configuration:
worker_pools = {
oke-instance = {
agent_config = {
are_all_plugins_disabled = false,
is_management_disabled = false,
is_monitoring_disabled = false,
plugins_config = {
"Bastion" = "DISABLED",
"Block Volume Management" = "DISABLED",
"Compute HPC RDMA Authentication" = "DISABLED",
"Compute HPC RDMA Auto-Configuration" = "DISABLED",
"Compute Instance Monitoring" = "ENABLED",
"Compute Instance Run Command" = "ENABLED",
"Compute RDMA GPU Monitoring" = "DISABLED",
"Custom Logs Monitoring" = "ENABLED",
"Management Agent" = "ENABLED",
"Oracle Autonomous Linux" = "DISABLED",
"OS Management Service Agent" = "DISABLED",
}
}
},
}