################################
## Cluster Configuration File ##
################################

[cluster Slurm]
FormLayout = selectionpanel
Category = Schedulers
Autoscale = $Autoscale

    [[node defaults]]
    UsePublicNetwork = $UsePublicNetwork
    Credentials = $Credentials
    SubnetId = $SubnetId
    Region = $Region
    KeyPairLocation = ~/.ssh/cyclecloud.pem
    Azure.Identities = $ManagedIdentity
    Tags = $NodeTags
    # Lustre mounts require termination notifications to unmount
    EnableTerminateNotification = ${NFSType == "lustre" || NFSSchedType == "lustre" || AdditionalNFSType == "lustre" || EnableTerminateNotification}
    TerminateNotificationTimeout = 10m

        [[[configuration]]]
        slurm.version = $configuration_slurm_version
        slurm.insiders = false
        slurm.user.uid = 11100
        slurm.user.gid = 11100
        munge.user.uid = 11101
        munge.user.gid = 11101
        slurm.slurmrestd.user.uid = 11102
        slurm.slurmrestd.user.gid = 11102
        slurm.enable_healthchecks = true
        slurm.accounting.enabled = $configuration_slurm_accounting_enabled
        slurm.accounting.url = $configuration_slurm_accounting_url
        slurm.accounting.user = $configuration_slurm_accounting_user
        slurm.accounting.password = $configuration_slurm_accounting_password
        slurm.accounting.certificate_url = $configuration_slurm_accounting_certificate_url
        slurm.accounting.storageloc = $configuration_slurm_accounting_storageloc
        slurm.additional.config = $additional_slurm_config
        slurm.ha_enabled = $configuration_slurm_ha_enabled
        slurm.launch_parameters = $configuration_slurm_launch_parameters

        # Disable ip-XXXXXXXX hostname generation
        cyclecloud.hosts.standalone_dns.enabled = ${NodeNameIsHostname==false}
        cyclecloud.hosts.simple_vpc_dns.enabled = ${NodeNameIsHostname==false}

        # For fast spin-up after Deallocate, force an immediate re-converge on boot
        cyclecloud.converge_on_boot = true

        # Disable normal NFS exports and mounts
        cyclecloud.mounts.sched.disabled = true
        cyclecloud.mounts.shared.disabled = true
        cyclecloud.exports.sched.disabled = true
        cyclecloud.exports.shared.disabled = true
        cyclecloud.exports.sched.samba.enabled = false
        cyclecloud.exports.shared.samba.enabled = false
        cyclecloud.exports.defaults.samba.enabled = false
        cshared.server.legacy_links_disabled = true

        # Monitoring
        cyclecloud.monitoring.enabled = $configuration_monitoring_enabled
        cyclecloud.monitoring.identity_client_id = $configuration_identity_client_id
        cyclecloud.monitoring.ingestion_endpoint = $configuration_ingestion_endpoint

        # May be used to identify the ID in cluster-init scripts
        cluster.identities.default = $ManagedIdentity

        [[[cluster-init cyclecloud/healthagent:default]]]
        [[[cluster-init cyclecloud/monitoring:default]]]
        [[[cluster-init cyclecloud/slurm:default:4.0.5]]]

        [[[volume boot]]]
        Size = ${ifThenElse(BootDiskSize > 0, BootDiskSize, undefined)}
        SSD = True

        [[[configuration cyclecloud.mounts.nfs_shared]]]
        type = $NFSType
        mountpoint = /shared
        export_path = ${ifThenElse(NFSType == "lustre", strcat("tcp:/lustrefs", NFSSharedExportPath), NFSSharedExportPath)}
        address = $NFSAddress
        options = $NFSSharedMountOptions

        [[[configuration cyclecloud.mounts.nfs_sched]]]
        type = $NFSSchedType
        mountpoint = /sched
        export_path = ${ifThenElse(NFSSchedType == "lustre", strcat("tcp:/lustrefs", NFSSchedExportPath), NFSSchedExportPath)}
        address = ${ifThenElse(UseBuiltinSched && !configuration_slurm_ha_enabled, undefined, NFSSchedAddress)}
        options = $NFSSchedMountOptions

        [[[configuration cyclecloud.mounts.additional_nfs]]]
        disabled = ${AdditionalNFS isnt true}
        type = $AdditionalNFSType
        address = $AdditionalNFSAddress
        mountpoint = $AdditionalNFSMountPoint
        export_path = ${ifThenElse(AdditionalNFSType == "lustre", strcat("tcp:/lustrefs", AdditionalNFSExportPath), AdditionalNFSExportPath)}
        options = $AdditionalNFSMountOptions

    [[node scheduler]]
    MachineType = $SchedulerMachineType
    ImageName = $SchedulerImageName
    IsReturnProxy = $ReturnProxy
    AdditionalClusterInitSpecs = $SchedulerClusterInitSpecs
    ComputerName = ${toLower(regexps("([^a-zA-Z0-9-])", ifThenElse(SchedulerHostName=="Cluster Prefix", StrJoin("-", ClusterName, "scheduler"), ifThenElse(Size(Trim(SchedulerHostName)) == 0 || SchedulerHostName == "Generated", undefined, SchedulerHostName)), "-"))}
    # indented version, for clarity:
    # ${toLower(
    #     regexps("([^a-zA-Z0-9-])",
    #         ifThenElse(SchedulerHostName=="Cluster Prefix",
    #             StrJoin("-", ClusterName, "scheduler"),
    #             ifThenElse(Size(Trim(SchedulerHostName)) == 0 || SchedulerHostName == "Generated",
    #                 undefined,
    #                 SchedulerHostName)),
    #         "-"))}
    Zone = ${ifThenElse(configuration_slurm_ha_enabled, SchedulerZone, undefined)}

        [[[configuration]]]
        slurm.role = scheduler
        # Disable NFS mount of built-in /sched since it is a local volume mount: cyclecloud.mounts.builtinsched
        cyclecloud.mounts.nfs_sched.disabled = ${UseBuiltinSched && !configuration_slurm_ha_enabled}
        cyclecloud.mounts.nfs_shared.disabled = ${UseBuiltinShared && !configuration_slurm_ha_enabled}
        slurm.secondary_scheduler_name = ${ifThenElse(configuration_slurm_ha_enabled, "scheduler-ha-1", undefined)}

        [[[cluster-init cyclecloud/healthagent:default]]]
        [[[cluster-init cyclecloud/monitoring:default]]]
        [[[cluster-init cyclecloud/slurm:scheduler:4.0.5]]]

        [[[network-interface eth0]]]
        AssociatePublicIpAddress = $UsePublicNetwork

        [[[volume sched]]]
        Size = $SchedFilesystemSize
        SSD = True
        Mount = builtinsched
        Persistent = True
        Disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled}

        [[[volume shared]]]
        Size = $FilesystemSize
        SSD = True
        Mount = builtinshared
        Persistent = True
        Disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled}

        [[[configuration cyclecloud.mounts.builtinsched]]]
        disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled}
        mountpoint = /sched
        fs_type = xfs

        [[[configuration cyclecloud.mounts.builtinshared]]]
        disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled}
        mountpoint = /shared
        fs_type = xfs

        [[[configuration cyclecloud.exports.builtinsched]]]
        disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled}
        export_path = /sched
        options = no_root_squash
        samba.enabled = false
        type = nfs

        [[[configuration cyclecloud.exports.builtinshared]]]
        disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled}
        export_path = /shared
        samba.enabled = false
        type = nfs

    [[nodearray scheduler-ha]]
    Extends = scheduler
    IsReturnProxy = false
    InitialCount = $configuration_slurm_ha_enabled
    Zone = $SchedulerHAZone
    # Do not inherit ComputerName from the scheduler node since it is not used in a nodearray.
    # The nodearray equivalent is ComputerNamePrefix; however, cluster-init handles renaming all hosts in a VMSS.
    ComputerName := undefined

        [[[configuration]]]
        autoscale.enabled = false
        slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)}
        slurm.use_nodename_as_hostname = $NodeNameIsHostname
        slurm.is_primary_scheduler = false

    [[nodearray login]]
    InitialCount = $NumberLoginNodes
    MaxCount = $MaxLoginNodeCount
    MachineType = $loginMachineType
    ImageName = $LoginImageName
    AdditionalClusterInitSpecs = $LoginClusterInitSpecs

        [[[cluster-init cyclecloud/healthagent:default]]]
        [[[cluster-init cyclecloud/monitoring:default]]]
        [[[cluster-init cyclecloud/slurm:login:4.0.5]]]

        [[[configuration]]]
        slurm.role = login
        autoscale.enabled = false
        slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)}
        slurm.use_nodename_as_hostname = $NodeNameIsHostname

    [[node nodearraybase]]
    Abstract = true

        [[[configuration]]]
        slurm.role = execute
        slurm.autoscale = true
        slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)}
        slurm.use_nodename_as_hostname = $NodeNameIsHostname

        [[[cluster-init cyclecloud/healthagent:default]]]
        [[[cluster-init cyclecloud/monitoring:default]]]
        [[[cluster-init cyclecloud/slurm:execute:4.0.5]]]

        [[[network-interface eth0]]]
        AssociatePublicIpAddress = $ExecuteNodesPublic

    [[nodearray hpc]]
    Extends = nodearraybase
    MachineType = $HPCMachineType
    ImageName = $HPCImageName
    MaxCount = $MaxHPCExecuteNodeCount
    Zone = ${ifThenElse(DefineNodesAvailabilityZone, HPCAvailabilityZone, undefined)}
    Azure.MaxScalesetSize = $HPCMaxScalesetSize
    AdditionalClusterInitSpecs = $HPCClusterInitSpecs
    EnableNodeHealthChecks = $EnableNodeHealthChecks

        [[[configuration]]]
        slurm.default_partition = true
        slurm.hpc = true
        slurm.partition = hpc

    [[nodearray htc]]
    Extends = nodearraybase
    MachineType = $HTCMachineType
    ImageName = $HTCImageName
    MaxCount = $MaxHTCExecuteNodeCount
    Zone = ${ifThenElse(DefineNodesAvailabilityZone, HTCAvailabilityZone, undefined)}
    Interruptible = $HTCUseLowPrio
    MaxPrice = $HTCSpotMaxPrice
    AdditionalClusterInitSpecs = $HTCClusterInitSpecs

        [[[configuration]]]
        slurm.hpc = false
        slurm.partition = htc
        # Set pcpu = false for all hyperthreaded VMs
        slurm.use_pcpu = false

    [[nodearray gpu]]
    Extends = nodearraybase
    MachineType = $GPUMachineType
    ImageName = $GPUImageName
    MaxCount = $MaxGPUExecuteNodeCount
    Zone = ${ifThenElse(DefineNodesAvailabilityZone, GPUAvailabilityZone, undefined)}
    Azure.MaxScalesetSize = $HPCMaxScalesetSize
    EnableNodeHealthChecks = $EnableNodeHealthChecks
    Interruptible = $GPUUseLowPrio
    MaxPrice = $GPUSpotMaxPrice
    AdditionalClusterInitSpecs = $GPUClusterInitSpecs

        [[[configuration]]]
        slurm.default_partition = true
        slurm.hpc = true
        slurm.partition = gpu
        # Parameter to enable or disable the IMEX service on a per-job basis.
        # IMEX support is enabled by default for GB200/GB300 but can be disabled by setting this parameter to False.
        #slurm.imex.enabled=True

    [[nodearray dynamic]]
    Extends = nodearraybase
    MachineType = $DynamicMachineType
    ImageName = $DynamicImageName
    MaxCoreCount = $MaxDynamicExecuteCoreCount
    Interruptible = $DynamicUseLowPrio
    MaxPrice = $DynamicSpotMaxPrice
    AdditionalClusterInitSpecs = $DynamicClusterInitSpecs

        [[[configuration]]]
        slurm.hpc = false
        # Slurm only allows a single feature to be defined in a nodeset. If multiple features are defined here, only the first value will be used for the nodeset.
        slurm.dynamic_feature := "dyn"
        # If this option is used, slurmd is started with this configuration for dynamic nodes.
        #slurm.dynamic_config := "-Z --conf \"Feature=dyn\""
        # Set pcpu = false for all hyperthreaded VMs
        slurm.use_pcpu = false
        slurm.autoscale = $EnableDynamicPartition

[parameters About]
Order = 1

    [[parameters About Slurm]]

        [[[parameter slurm]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''
Slurm icon

Follow the instructions in the README for details on extending and configuring the project for your environment.


Slurm is the most widely used workload manager in HPC, serving as the scheduler of choice for six of the top ten systems in the TOP500, with market penetration of more than 70%. Slurm is an advanced, open-source scheduler designed to satisfy the demanding needs of high-performance computing (HPC), high-throughput computing (HTC), and artificial intelligence (AI).

Commercial Support provided by SchedMD

Get more from your HPC investment! SchedMD, the company behind Slurm development, can answer your Slurm questions and explain their options for consultation, training, support, and migration.

Contact SchedMD

View more details about Slurm:

Slurm at a glance

Slurm provides massive scalability and can easily manage performance requirements for small clusters, large clusters, and supercomputers. Slurm outperforms competing schedulers, with compute rates of:

  • 100K+ nodes/GPUs
  • 17M+ jobs per day
  • 120M+ jobs per week

Slurm’s plug-in based architecture enables optimization and control in scheduling operations to meet organizational priorities. With first class resource management for GPUs, Slurm allows users to request GPU resources alongside CPUs. This flexibility ensures that jobs are executed quickly and efficiently, while maximizing resource utilization.
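
For example, a job can request GPUs alongside CPUs directly at submission time. A minimal illustration using standard Slurm options (the script name train.sh is a placeholder):

    sbatch --gres=gpu:2 --ntasks=8 train.sh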


Other Slurm features include:

  • NVIDIA and AMD GPU support for AI, LLM, and ML environments
  • Advanced scheduling policies
  • Unique HPC, HTC, AI/ML workload expertise
  • Cloud bursting capabilities
  • Power saving capabilities, accounting, and reporting
  • Provided REST API daemon
  • Native support of containers
  • Tailored Slurm consulting and training available through SchedMD
'''

[parameters Required Settings]
Order = 10

    [[parameters Virtual Machines]]
    Description = "The cluster has two roles: the scheduler node with a shared filer, and the execute hosts. Configure which VM types to use based on the requirements of your application."
    Order = 20

        [[[parameter Region]]]
        Label = Region
        Description = Deployment Location
        ParameterType = Cloud.Region

        [[[parameter SchedulerMachineType]]]
        Label = Scheduler VM Type
        Description = The VM type for the scheduler node
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_D4ads_v5

        [[[parameter loginMachineType]]]
        Label = Login Node VM Type
        Description = The VM type for login nodes.
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_D8as_v4

        [[[parameter HPCMachineType]]]
        Label = HPC VM Type
        Description = The VM type for HPC execute nodes
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_F2s_v2

        [[[parameter HTCMachineType]]]
        Label = HTC VM Type
        Description = The VM type for HTC execute nodes
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_F2s_v2

        [[[parameter GPUMachineType]]]
        Label = GPU VM Type
        Description = The VM type for GPU execute nodes
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_NC24rs_v3

        [[[parameter DynamicMachineType]]]
        Label = Dyn VM Type
        Description = The VM type for Dynamic execute nodes
        ParameterType = Cloud.MachineType
        DefaultValue = Standard_F2s_v2
        Config.MultiSelect = true

    [[parameters Auto-Scaling]]
    Description = "The cluster can autoscale to the workload, adding execute hosts as jobs are queued. To enable this, check the box below and choose the initial and maximum core counts for the cluster."
    Order = 30

        [[[parameter Autoscale]]]
        Label = Autoscale
        DefaultValue = true
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Start and stop execute instances automatically

        [[[parameter MaxHTCExecuteNodeCount]]]
        Label = Max HTC Nodes
        Description = The maximum number of HTC execute nodes to start
        DefaultValue = 100
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 0
        Config.IntegerOnly = true

        [[[parameter MaxHPCExecuteNodeCount]]]
        Label = Max HPC Nodes
        Description = The maximum number of HPC execute nodes to start
        DefaultValue = 16
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 0
        Config.IntegerOnly = true

        [[[parameter MaxGPUExecuteNodeCount]]]
        Label = Max GPU Nodes
        Description = The maximum number of GPU execute nodes to start
        DefaultValue = 8
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 0
        Config.IntegerOnly = true

        [[[parameter MaxDynamicExecuteCoreCount]]]
        Label = Max Dyn Cores
        Description = The maximum number of Dynamic execute cores to start
        DefaultValue = 100
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 1
        Config.IntegerOnly = true

        [[[parameter HPCMaxScalesetSize]]]
        Label = Max VMs per VMSS
        Description = The maximum number of VMs created per VM scale set (a VMSS corresponds to a switch in Slurm's topology).
        DefaultValue = 100
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 1
        Config.IntegerOnly = true

        [[[parameter HTCUseLowPrio]]]
        Label = HTC Spot
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Use Spot VMs for HTC execute hosts

        [[[parameter HTCSpotMaxPrice]]]
        Label = Max Price HTC
        DefaultValue = -1
        Description = Max price for Spot VMs in USD (a value of -1 will not evict based on price)
        Config.Plugin = pico.form.NumberTextBox
        Conditions.Excluded := HTCUseLowPrio isnt true
        Config.MinValue = -1

        [[[parameter GPUUseLowPrio]]]
        Label = GPU Spot
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Use Spot VMs for GPU execute hosts

        [[[parameter GPUSpotMaxPrice]]]
        Label = Max Price GPU
        DefaultValue = -1
        Description = Max price for Spot VMs in USD (a value of -1 will not evict based on price)
        Config.Plugin = pico.form.NumberTextBox
        Conditions.Excluded := GPUUseLowPrio isnt true
        Config.MinValue = -1

        [[[parameter DynamicUseLowPrio]]]
        Label = Dyn Spot
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Use Spot VMs for Dynamic execute hosts

        [[[parameter DynamicSpotMaxPrice]]]
        Label = Max Price Dyn
        DefaultValue = -1
        Description = Max price for Spot VMs in USD (a value of -1 will not evict based on price)
        Config.Plugin = pico.form.NumberTextBox
        Conditions.Excluded := DynamicUseLowPrio isnt true
        Config.MinValue = -1

        [[[parameter DefineNodesAvailabilityZone]]]
        Label = Specify Availability Zones
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Specify availability zones for nodes

        [[[parameter HPCAvailabilityZone]]]
        Label = HPC Availability Zone
        ParameterType = StringList
        Conditions.Hidden := !DefineNodesAvailabilityZone
        Description = List of availability zones where HPC nodes will be spread

        [[[parameter HTCAvailabilityZone]]]
        Label = HTC Availability Zone
        ParameterType = StringList
        Conditions.Hidden := !DefineNodesAvailabilityZone
        Description = List of availability zones where HTC nodes will be spread

        [[[parameter GPUAvailabilityZone]]]
        Label = GPU Availability Zone
        ParameterType = StringList
        Conditions.Hidden := !DefineNodesAvailabilityZone
        Description = List of availability zones where GPU nodes will be spread

        [[[parameter NumberLoginNodes]]]
        Label = Num Login Nodes
        DefaultValue = 0
        Description = Number of optional login nodes to create.
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 0
        Config.MaxValue = 10000
        Config.IntegerOnly = true

        [[[parameter MaxLoginNodeCount]]]
        Label = Max Login Nodes
        DefaultValue = 10
        Description = Maximum number of login nodes to create
        Config.Plugin = pico.form.NumberTextBox
        Config.MinValue = 0
        Config.MaxValue = 10000
        Config.IntegerOnly = true

    [[parameters Networking]]
    Order = 40

        [[[parameter SubnetId]]]
        Label = Subnet ID
        Description = Subnet Resource Path (ResourceGroup/VirtualNetwork/Subnet)
        ParameterType = Azure.Subnet
        Required = True

    [[parameters High Availability]]
    Order = 50
    Description = "Slurm can be set up in HA mode, where slurmctld runs on two nodes with failover. Note that checking this box requires an external NFS, so any reference to the 'builtin' NFS will be hidden."

        [[[parameter configuration_slurm_ha_enabled]]]
        Label = Slurm HA Node
        Description = Deploy with an additional HA node
        DefaultValue = false
        ParameterType = Boolean

[parameters Network Attached Storage]
Order = 15

    [[parameters Shared Storage]]
    Order = 10

        [[[parameter About Shared Storage]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''

The directories /sched and /shared are network attached mounts and exist on all nodes of the cluster.

Options for providing these mounts:
[Builtin]: The scheduler node is an NFS server that provides the mountpoint to the other nodes of the cluster (not supported for HA configurations).
[External NFS]: Network attached storage, such as Azure NetApp Files, HPC Cache, or another VM running an NFS server, provides the mountpoint.
[Azure Managed Lustre]: An Azure Managed Lustre deployment provides the mountpoint.

Note: the cluster must be terminated for changes to filesystem mounts to take effect.

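For example, selecting [External NFS] for /shared feeds the cyclecloud.mounts.nfs_shared stanza defined at the top of this template; with illustrative values (the address below is a placeholder) it is equivalent to:

    [[[configuration cyclecloud.mounts.nfs_shared]]]
    type = nfs
    mountpoint = /shared
    export_path = /shared
    address = 10.1.0.4
    options = rw,hard
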
'''
        Conditions.Hidden := false

    [[parameters Scheduler Mount]]
    Order = 20
    Label = File-system Mount for /sched

        [[[parameter About sched]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''

Slurm's configuration is linked in from the /sched directory. It is managed by the scheduler node.

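On cluster nodes, the Slurm configuration typically appears as files symlinked from this mount; an illustrative example (the exact path layout may vary):

    /etc/slurm/slurm.conf -> /sched/<clustername>/slurm.conf
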
'''
        Order = 6

        [[[parameter About sched part 2]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''

Uncheck the box below to disable the built-in NFS export of the /sched directory and use an external file-system.

'''
        Order = 7
        Conditions.Hidden := configuration_slurm_ha_enabled

        [[[parameter UseBuiltinSched]]]
        Label = Use Builtin NFS
        Description = Use the builtin NFS for /sched
        DefaultValue = true
        ParameterType = Boolean
        Conditions.Hidden := configuration_slurm_ha_enabled
        Disabled = configuration_slurm_ha_enabled

        [[[parameter NFSSchedDiskWarning]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template := "

Warning: switching an active cluster over to NFS or Lustre from Builtin will delete the shared disk.

" Conditions.Hidden := UseBuiltinSched || configuration_slurm_ha_enabled [[[parameter NFSSchedType]]] Label = FS Type ParameterType = StringList Config.Label = Type of shared filesystem to use for this cluster Config.Plugin = pico.form.Dropdown Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} DefaultValue = nfs Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled [[[parameter NFSSchedAddress]]] Label = IP Address Description = The IP address or hostname of the NFS server or Lustre FS. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. Config.ParameterType = String Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled [[[parameter NFSSchedExportPath]]] Label = Export Path Description = The path exported by the file system DefaultValue = /sched Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled [[[parameter NFSSchedMountOptions]]] Label = Mount Options Description = NFS Client Mount Options Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled [[[parameter SchedFilesystemSize]]] Label = Size (GB) Description = The filesystem size (cannot be changed after initial start) DefaultValue = 30 Config.Plugin = pico.form.NumberTextBox Config.MinValue = 10 Config.MaxValue = 10240 Config.IntegerOnly = true Conditions.Excluded := !UseBuiltinSched || configuration_slurm_ha_enabled [[parameters Default NFS Share]] Order = 30 Label = File-system Mount for /shared [[[parameter About shared]]] HideLabel = true Config.Plugin = pico.widget.HtmlTemplateWidget Config.Template = '''

Users' home directories reside within the /shared mountpoint with the base homedir /shared/home.

'''
        Order = 6

        [[[parameter About shared part 2]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''

Uncheck the box below to disable the built-in NFS export of the /shared directory and use an external file-system.

'''
        Order = 7
        Conditions.Hidden := configuration_slurm_ha_enabled

        [[[parameter UseBuiltinShared]]]
        Label = Use Builtin NFS
        Description = Use the builtin NFS for /shared
        DefaultValue = true
        ParameterType = Boolean
        Conditions.Hidden := configuration_slurm_ha_enabled
        Disabled = configuration_slurm_ha_enabled

        [[[parameter NFSDiskWarning]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template := "

Warning: switching an active cluster over to NFS or Lustre from Builtin will delete the shared disk.

" Conditions.Hidden := UseBuiltinShared || configuration_slurm_ha_enabled [[[parameter NFSType]]] Label = FS Type ParameterType = StringList Config.Label = Type of shared filesystem to use for this cluster Config.Plugin = pico.form.Dropdown Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} DefaultValue = nfs Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled [[[parameter NFSAddress]]] Label = IP Address Description = The IP address or hostname of the NFS server or Lustre FS. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. Config.ParameterType = String Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled [[[parameter NFSSharedExportPath]]] Label = Export Path Description = The path exported by the file system DefaultValue = /shared Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled [[[parameter NFSSharedMountOptions]]] Label = Mount Options Description = NFS Client Mount Options Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled [[[parameter FilesystemSize]]] Label = Size (GB) Description = The filesystem size (cannot be changed after initial start) DefaultValue = 100 Config.Plugin = pico.form.NumberTextBox Config.MinValue = 10 Config.MaxValue = 10240 Config.IntegerOnly = true Conditions.Excluded := !UseBuiltinShared || configuration_slurm_ha_enabled [[parameters Additional NFS Mount]] Order = 40 Label = Additional Filesystem Mount [[[parameter Additional Shared FS Mount Readme]]] HideLabel = true Config.Plugin = pico.widget.HtmlTemplateWidget Config.Template := "

Mount another shared file-system endpoint on the cluster nodes.

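For example, an additional NFS mount with illustrative values (the address below is a placeholder) feeds the cyclecloud.mounts.additional_nfs stanza defined at the top of this template:

    [[[configuration cyclecloud.mounts.additional_nfs]]]
    type = nfs
    address = 10.1.0.5
    mountpoint = /data
    export_path = /data
    options = rw,hard,vers=4.1
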
" Order = 20 [[[parameter AdditionalNFS]]] HideLabel = true DefaultValue = false Widget.Plugin = pico.form.BooleanCheckBox Widget.Label = Add Shared Filesystem mount [[[parameter AdditionalNFSType]]] Label = FS Type ParameterType = StringList Config.Label = Shared filesystem type of the additional mount Config.Plugin = pico.form.Dropdown Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} DefaultValue = nfs Conditions.Excluded := AdditionalNFS isnt true [[[parameter AdditionalNFSAddress]]] Label = IP Address Description = The IP address or hostname of the additional mount. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. Config.ParameterType = String Conditions.Excluded := AdditionalNFS isnt true [[[parameter AdditionalNFSMountPoint]]] Label = Mount Point Description = The path at which to mount the Filesystem DefaultValue = /data Conditions.Excluded := AdditionalNFS isnt true [[[parameter AdditionalNFSExportPath]]] Label = Export Path Description = The path exported by the file system DefaultValue = /data Conditions.Excluded := AdditionalNFS isnt true [[[parameter AdditionalNFSMountOptions]]] Label = Mount Options Description = Filesystem Client Mount Options Conditions.Excluded := AdditionalNFS isnt true [parameters Advanced Settings] Order = 20 [[parameters Azure Settings]] Order = 10 [[[parameter Credentials]]] Description = The credentials for the cloud provider ParameterType = Cloud.Credentials [[[parameter ManagedIdentity]]] Label = Managed Id Description = Optionally assign an Azure user assigned managed identity to all nodes to access Azure resources using assigned roles. ParameterType = Azure.ManagedIdentity DefaultValue = =undefined [[[parameter BootDiskSize]]] Description = Optional: Size of the OS/boot disk in GB for all nodes in the cluster (leave at 0 to use Image size) ParameterType = Integer Config.Plugin = pico.form.NumberTextBox Config.MinValue = 0 Config.MaxValue = 32,000 Config.IntegerOnly = true Config.Increment = 64 DefaultValue = 0 [[[parameter EnableDynamicPartition]]] Label = Enable Dynamic Partition DefaultValue = true Widget.Plugin = pico.form.BooleanCheckBox Widget.Label = Create a partition for the dynamic nodearray [[[parameter NodeTags]]] Label = VM Tags Description = Tags applied to all nodes ParameterType = Record DefaultValue := [] Config.MultiSelect = false [[parameters Slurm Settings ]] Order = 5 [[[parameter slurm_version_warning]]] HideLabel = true Config.Plugin = pico.widget.HtmlTemplateWidget Config.Template := "
Note: For SLES HPC, only the Slurm version supported by SLES HPC's zypper repos can be installed. At the time of this release, that is 23.02.7.
" [[[parameter configuration_slurm_version]]] Required = True Label = Slurm Version Description = Version of Slurm to install on the cluster ParameterType = StringList Config.Plugin = pico.form.Dropdown Config.FreeForm = true Config.Entries := {[Value="25.05.5"]} DefaultValue = 25.05.5 [[[parameter configuration_slurm_accounting_enabled]]] Label = Job Accounting DefaultValue = false Widget.Plugin = pico.form.BooleanCheckBox Widget.Label = Configure Slurm job accounting [[[parameter slurm_database_warning]]] HideLabel = true Config.Plugin = pico.widget.HtmlTemplateWidget Conditions.Excluded := configuration_slurm_accounting_enabled isnt true Config.Template := "
Note: Checking this box will create persistent databases and tables in the SQL database provided. Deleting this cluster will not automatically delete those databases. Users are responsible for periodically purging/archiving their Slurm databases to control costs.
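One common way to bound database growth (an illustrative slurmdbd.conf snippet; this template does not set these values for you) is to configure purge intervals, for example:

    PurgeEventAfter=2months
    PurgeJobAfter=12months
    PurgeStepAfter=2months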
" [[[parameter configuration_slurm_accounting_url]]] Label = Database URL Description = URL of the database to use for Slurm job accounting Conditions.Excluded := configuration_slurm_accounting_enabled isnt true [[[parameter configuration_slurm_accounting_storageloc]]] Label = Database name Description = Database name to store slurm accounting records Conditions.Excluded := configuration_slurm_accounting_enabled isnt true [[[parameter configuration_slurm_accounting_user]]] Label = Database User Description = User for Slurm DBD admin Conditions.Excluded := configuration_slurm_accounting_enabled isnt true [[[parameter configuration_slurm_accounting_password]]] Label = Database Password Description = Password for Slurm DBD admin ParameterType = Password Conditions.Excluded := configuration_slurm_accounting_enabled isnt true [[[parameter configuration_slurm_accounting_certificate_url]]] Label = SSL Certificate URL Description = URL to fetch SSL certificate for authentication to DB. Use AzureCA.pem (embedded) for use with deprecated MariaDB instances. Conditions.Excluded := configuration_slurm_accounting_enabled isnt true ParameterType = StringList Config.Plugin = pico.form.Dropdown Config.FreeForm = true Config.Entries := {[Value=""], [Value="AzureCA.pem"]} DefaultValue = "" [[[parameter EnableTerminateNotification]]] Label = Enable Termination notifications DefaultValue = False [[[parameter additional_slurm_config]]] Label = Slurm Configuration Description = Any additional lines to add to slurm.conf ParameterType = Text [[[parameter configuration_slurm_launch_parameters]]] Label = Launch Parameters Description = Deploy Slurm with Launch Parameters (comma delimited) DefaultValue = 'use_interactive_step' ParameterType = String [[parameters Software]] Description = "Specify the scheduling software, and base OS installed on all nodes, and optionally the cluster-init and chef versions from your locker." Order = 10 [[[parameter NodeNameIsHostname]]] Label = Name As Hostname Description = Should the hostname match the nodename for execute nodes? ParameterType = Boolean DefaultValue = true [[[parameter NodeNamePrefix]]] Label = Node Prefix Description = Prefix for generated node names, i.e. "prefix-" generates prefix-nodearray-1. Use 'Cluster Prefix' to get $ClusterName-nodearray-1 ParameterType = StringList Config.Plugin = pico.form.Dropdown Config.FreeForm = true DefaultValue = "Cluster Prefix" Config.Entries := {[Value=""], [Value="Cluster Prefix"]} Conditions.Hidden := NodeNameIsHostname != true [[[parameter SchedulerHostName]]] Label = Scheduler Hostname Description = Hostname of scheduler. 'Generated' uses the default generated hostname. 'Cluster Prefix' will generate $ClusterName-scheduler. 
        ParameterType = StringList
        Config.Plugin = pico.form.Dropdown
        Config.FreeForm = true
        DefaultValue = "Cluster Prefix"
        Config.Entries := {[Value="Generated"], [Value="Cluster Prefix"]}
        Conditions.Hidden := NodeNameIsHostname != true

        [[[parameter SchedulerImageName]]]
        Label = Scheduler OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter LoginImageName]]]
        Label = Login Node OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter HPCImageName]]]
        Label = HPC OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter HTCImageName]]]
        Label = HTC OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter GPUImageName]]]
        Label = GPU OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter DynamicImageName]]]
        Label = Dynamic OS
        ParameterType = Cloud.Image
        Config.OS = linux
        DefaultValue = cycle.image.ubuntu22
        Config.Filter := Package in {"cycle.image.ubuntu24", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8", "almalinux9"}

        [[[parameter SchedulerClusterInitSpecs]]]
        Label = Scheduler Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to the scheduler node
        ParameterType = Cloud.ClusterInitSpecs

        [[[parameter LoginClusterInitSpecs]]]
        Label = Login Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to Login nodes
        ParameterType = Cloud.ClusterInitSpecs

        [[[parameter HTCClusterInitSpecs]]]
        Label = HTC Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to HTC execute nodes
        ParameterType = Cloud.ClusterInitSpecs

        [[[parameter HPCClusterInitSpecs]]]
        Label = HPC Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to HPC execute nodes
        ParameterType = Cloud.ClusterInitSpecs

        [[[parameter GPUClusterInitSpecs]]]
        Label = GPU Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to GPU execute nodes
        ParameterType = Cloud.ClusterInitSpecs

        [[[parameter DynamicClusterInitSpecs]]]
        Label = Dyn Cluster-Init
        DefaultValue = =undefined
        Description = Cluster init specs to apply to Dynamic execute nodes
        ParameterType = Cloud.ClusterInitSpecs

    [[parameters Advanced Networking]]

        [[[parameter ReturnProxy]]]
        Label = Return Proxy
        DefaultValue = false
        ParameterType = Boolean
        Config.Label = Use SSH tunnel to connect to CycleCloud (required if direct access is blocked)

        [[[parameter UsePublicNetwork]]]
        Label = Public Head Node
        DefaultValue = false
        ParameterType = Boolean
        Config.Label = Access scheduler node from the Internet

        [[[parameter ExecuteNodesPublic]]]
        Label = Public Execute
        DefaultValue = false
        ParameterType = Boolean
        Config.Label = Access execute nodes from the Internet
        Conditions.Excluded := UsePublicNetwork isnt true
        [[[parameter SchedulerZone]]]
        Label = Scheduler Zone
        Description = The availability zone in which to deploy the scheduler node.
        DefaultValue = =undefined
        Config.Plugin = pico.form.Dropdown
        Config.Entries := {[Value=1], [Value=2], [Value=3], [Value=undefined; Label="Any"]}

        [[[parameter SchedulerHAZone]]]
        Label = Scheduler HA Zone
        Description = The availability zone in which to deploy the scheduler-ha node.
        DefaultValue = =undefined
        Config.Plugin = pico.form.Dropdown
        Config.Entries := {[Value=1], [Value=2], [Value=3], [Value=undefined; Label="Any"]}
        Conditions.Hidden := configuration_slurm_ha_enabled isnt true

    [[parameters Node Health Checks]]
    Description = "Section for configuring Node Health Checks"
    Order = 12

        [[[parameter EnableNodeHealthChecks]]]
        Label = Enable NHC tests
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Run Node Health Checks on startup

[parameters Monitoring]
Order = 25

    [[parameters Cyclecloud Monitoring]]
    Order = 10

        [[[parameter monitoring_description]]]
        HideLabel = true
        Config.Plugin = pico.widget.HtmlTemplateWidget
        Config.Template = '''

Monitor CycleCloud clusters with Prometheus and Grafana. Follow the README for details on setting up the Managed Monitoring Infrastructure.

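The ingestion endpoint is the metrics ingestion URL of the target Azure Monitor workspace; it typically has a form like the following (illustrative placeholder):

    https://myworkspace-abcd.eastus-1.metrics.ingest.monitor.azure.com
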
'''

        [[[parameter configuration_monitoring_enabled]]]
        Label = Monitoring
        DefaultValue = false
        Widget.Plugin = pico.form.BooleanCheckBox
        Widget.Label = Enable Monitoring

        [[[parameter configuration_identity_client_id]]]
        Label = Client ID
        Description = Client ID of the Managed Identity with Monitoring Metrics Publisher role
        Conditions.Excluded := configuration_monitoring_enabled isnt true
        Conditions.Required := configuration_monitoring_enabled is true

        [[[parameter configuration_ingestion_endpoint]]]
        Label = Ingestion Endpoint
        Description = The Azure Monitor Workspace in which to push metrics
        Conditions.Excluded := configuration_monitoring_enabled isnt true
        Conditions.Required := configuration_monitoring_enabled is true
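
################################################################################
# Illustrative examples (assumptions, not part of the shipped template):
#
# The 'Slurm Configuration' text box (additional_slurm_config) accepts extra
# slurm.conf lines, for example:
#
#   PreemptType=preempt/partition_prio
#   PreemptMode=REQUEUE
#
# The 'Launch Parameters' value (comma delimited) is expected to surface as a
# slurm.conf LaunchParameters line, e.g.:
#
#   LaunchParameters=use_interactive_step,enable_nss_slurm
#
# Both options shown are standard slurm.conf LaunchParameters flags; adjust to
# your site's needs.
################################################################################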