# CloudFormation template that composes several nested stacks (VPC, demo S3
# bucket, FSx for Lustre, ParallelCluster custom-resource provider) and then
# creates a Slurm cluster through the Custom::PclusterCluster resource.
AWSTemplateFormatVersion: '2010-09-09'
Description: >
  Recipe to configure an HPC cluster using nested stacks.

### Stack metadata
Metadata:
  # Groups the template parameters into labelled sections; each Parameters
  # list names parameters declared in the Parameters section of this template.
  AWS::CloudFormation::Interface:
    ParameterGroups:
      - Label: { default: 'Cluster Design' }
        Parameters: [OS, Architecture, ComputeInstanceMax]
      - Label: { default: 'Networking and Access' }
        Parameters: [AvailabilityZoneName]
      - Label: { default: 'Storage Configuration' }
        Parameters: [FilesystemCapacityGb]

# Partition checks; both are consumed when building the console URL in Outputs.
Conditions:
  GovCloud: !Equals
    - !Ref AWS::Partition
    - aws-us-gov
  China: !Equals
    - !Ref AWS::Partition
    - aws-cn

Parameters:

  # Feeds MaxCount of the "nodes" compute resource in the "compute" Slurm queue.
  ComputeInstanceMax:
    Type: Number
    Description: Maximum number of compute instances in the queue
    Default: 4
    MinValue: 1

  # Passed straight through to ClusterConfiguration.Image.Os.
  OS:
    Description: Cluster operating system
    Type: String
    AllowedValues: [alinux2, centos7, ubuntu2004, ubuntu2204, rhel8]
    Default: alinux2

  # Each allowed value must exist as a key in every row of the
  # InstanceTypeForArchitecture mapping, which is indexed by this parameter.
  Architecture:
    Description: Choose a node architecture.
    Type: String
    AllowedValues: [Graviton, GPU, x86]
    Default: Graviton

  # Forwarded to the PclusterVpc nested stack as its AvailabilityZone parameter.
  # No Default is declared, so a value has to be supplied at stack creation.
  AvailabilityZoneName:
    Description: Name of the Availability Zone where you wish to launch your cluster
    Type: AWS::EC2::AvailabilityZone::Name

  # Forwarded to the FSxFilesystem nested stack as its Capacity parameter.
  FilesystemCapacityGb:
    Description: Size (in GB) of FSx for Lustre volume at /fsx
    Type: Number
    AllowedValues:
      - 1200
      - 2400
      - 4800
      - 7200
      - 9600
    Default: 1200

Mappings:
  # ParallelCluster release; substituted into the custom-resource provider
  # template URL. Quoted so it can never be read as anything but a string.
  ParallelCluster:
    Constants: { Version: '3.7.2' }
  # Path segment of the aws-hpc-recipes bucket from which the nested-stack
  # templates are fetched.
  Recipes:
    Constants: { Version: main }
  # Instance type lookup: first key is the node role, second key is the value
  # of the Architecture parameter.
  # NOTE(review): no resource visible in this file reads the LoginNodes row.
  InstanceTypeForArchitecture:
    HeadNode: { Graviton: c7g.medium, GPU: c6a.large, x86: c6a.large }
    LoginNodes: { Graviton: c7g.large, GPU: c6a.xlarge, x86: c6a.xlarge }
    ComputeNodes: { Graviton: c7g.xlarge, GPU: g5.xlarge, x86: c6a.xlarge }

Resources:

  # Nested stack for networking (template: net/hpc_basic/assets/public-private.yaml).
  # Other resources in this file read its VPC, DefaultPublicSubnet and
  # DefaultPrivateSubnet outputs.
  PclusterVpc:
    Type: AWS::CloudFormation::Stack
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete
    Properties:
      TimeoutInMinutes: 10
      TemplateURL: !Sub
        - 'https://aws-hpc-recipes.s3.us-east-1.amazonaws.com/${Version}/recipes/net/hpc_basic/assets/public-private.yaml'
        - Version: !FindInMap [Recipes, Constants, Version]
      Parameters:
        AvailabilityZone: !Ref AvailabilityZoneName
        VpcCIDR: 10.0.0.0/16
        PublicCIDR: 10.0.0.0/24
        PrivateCIDR: 10.0.16.0/20


  # Nested stack for the demo S3 bucket (template: storage/s3_demo/assets/main.yaml).
  # Its BucketName output is used to build the FSx DataRepositoryPath.
  DemoBucket:
    Type: AWS::CloudFormation::Stack
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete
    Properties:
      TimeoutInMinutes: 10
      TemplateURL: !Sub
        - 'https://aws-hpc-recipes.s3.us-east-1.amazonaws.com/${Version}/recipes/storage/s3_demo/assets/main.yaml'
        - Version: !FindInMap [Recipes, Constants, Version]

  # Nested stack for the FSx for Lustre filesystem
  # (template: storage/fsx_lustre/assets/persistent.yaml). It is placed in the
  # private subnet of PclusterVpc and given an s3:// path derived from DemoBucket.
  # Unlike the other nested stacks in this file, no TimeoutInMinutes is set here.
  FSxFilesystem:
    Type: AWS::CloudFormation::Stack
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete
    Properties:
      TemplateURL: !Sub
        - 'https://aws-hpc-recipes.s3.us-east-1.amazonaws.com/${Version}/recipes/storage/fsx_lustre/assets/persistent.yaml'
        - Version: !FindInMap [Recipes, Constants, Version]
      Parameters:
        Capacity: !Ref FilesystemCapacityGb
        VpcId: !GetAtt PclusterVpc.Outputs.VPC
        SubnetId: !GetAtt PclusterVpc.Outputs.DefaultPrivateSubnet
        DataRepositoryPath: !Sub
          - 's3://${BucketName}/'
          - BucketName: !GetAtt DemoBucket.Outputs.BucketName

  # Nested stack loaded from the region-specific ParallelCluster bucket; its
  # ServiceToken output backs the Custom::PclusterCluster resource.
  PclusterClusterProvider:
    Type: AWS::CloudFormation::Stack
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete
    Properties:
      TimeoutInMinutes: 10
      TemplateURL: !Sub
        - 'https://${AWS::Region}-aws-parallelcluster.s3.${AWS::Region}.${AWS::URLSuffix}/parallelcluster/${Version}/templates/custom_resource/cluster.yaml'
        - Version: !FindInMap [ParallelCluster, Constants, Version]

  # The cluster itself, created through the custom resource whose ServiceToken
  # is exported by the PclusterClusterProvider nested stack. Everything under
  # ClusterConfiguration is the cluster definition handed to that provider.
  PclusterCluster:
    Type: Custom::PclusterCluster
    DeletionPolicy: Delete
    UpdateReplacePolicy: Delete
    Properties:
      ServiceToken: !GetAtt [ PclusterClusterProvider , Outputs.ServiceToken ]
      ClusterName: !Sub 'c-${AWS::StackName}'
      ClusterConfiguration:
        Image:
          Os: !Ref OS
        HeadNode:
          InstanceType: !FindInMap [ InstanceTypeForArchitecture, HeadNode, !Ref Architecture ]
          Networking:
            # Head node sits in the public subnet; compute nodes use the private one.
            SubnetId: !GetAtt [ PclusterVpc, Outputs.DefaultPublicSubnet ]
            AdditionalSecurityGroups:
              - !GetAtt [ FSxFilesystem, Outputs.FSxLustreSecurityGroupId ]
          Dcv:
            Enabled: false
          LocalStorage:
            RootVolume:
              Size: 40
          Iam:
            AdditionalIamPolicies:
              # The partition is resolved at deploy time instead of being
              # hard-coded to "aws": this template also targets aws-us-gov and
              # aws-cn (see Conditions), where an "arn:aws:..." ARN is invalid.
              - Policy: !Sub 'arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore'
        Scheduling:
          Scheduler: slurm
          SlurmSettings:
            QueueUpdateStrategy: TERMINATE
          SlurmQueues:
            - Name: compute
              ComputeResources:
                - Name: nodes
                  InstanceType: !FindInMap [ InstanceTypeForArchitecture, ComputeNodes, !Ref Architecture ]
                  MinCount: 0
                  MaxCount: !Ref ComputeInstanceMax
                  DisableSimultaneousMultithreading: true
              Networking:
                SubnetIds:
                  - !GetAtt [ PclusterVpc, Outputs.DefaultPrivateSubnet ]
                PlacementGroup:
                  Enabled: true
                AdditionalSecurityGroups:
                  - !GetAtt [ FSxFilesystem, Outputs.FSxLustreSecurityGroupId ]
              ComputeSettings:
                LocalStorage:
                  RootVolume:
                    Size: 40
        SharedStorage:
          # EFS volume declared inline in the cluster configuration.
          - StorageType: Efs
            Name: shared
            MountDir: /shared
            EfsSettings:
              DeletionPolicy: Delete
          # Existing filesystem from the FSxFilesystem nested stack, attached by ID.
          - StorageType: FsxLustre
            Name: fsxlustre
            MountDir: /fsx
            FsxLustreSettings:
              FileSystemId: !GetAtt [ FSxFilesystem, Outputs.FSxLustreFilesystemId ]

Outputs:
  # NOTE(review): despite the "Ip" suffix in the logical name, this output
  # carries the head node's EC2 instance ID (see Description and Export name).
  # The logical name is kept because renaming an output would break anything
  # that already references it.
  HeadNodeIp:
    Value: !GetAtt PclusterCluster.headNode.instanceId
    Description: EC2 instance ID for the HeadNode
    Export:
      Name: !Sub '${AWS::StackName}-HeadNodeInstanceId'
  # Console link for the head node. The console domain is chosen by partition:
  # GovCloud, then China, otherwise the region-prefixed commercial domain.
  SystemManagerUrl:
    Description: URL to access the HeadNode via SystemManager
    Value: !Sub
      - 'https://${ConsoleDomain}/systems-manager/session-manager/${InstanceId}?region=${AWS::Region}'
      - InstanceId: !GetAtt PclusterCluster.headNode.instanceId
        ConsoleDomain: !If
          - GovCloud
          - console.amazonaws-us-gov.com
          - !If
            - China
            - console.amazonaws.cn
            - !Sub '${AWS::Region}.console.aws.amazon.com'
    Export:
      Name: !Sub '${AWS::StackName}-SystemManagerUrl'
  # Surfaces the validationMessages attribute of the cluster custom resource.
  # This output is not exported, unlike the two outputs declared before it.
  ValidationMessages:
    Value: !GetAtt [PclusterCluster, validationMessages]
    Description: Warnings from cluster create or update operations.