# CloudFormation template that builds a GPU-enabled AMI for AWS Batch.
#
# Flow:
#   1. `Instance` boots from the base image and runs the UserData script, which
#      installs ecs-init and nvidia-docker, then signals `AMICreate` and shuts down.
#   2. `AMI` (a Lambda-backed custom resource) waits for the instance to stop,
#      creates an image from it and waits for the image to become available.
#   3. On stack deletion the custom resource deregisters the image and deletes
#      the snapshots whose description references it.
Description: Create a GPU AMI for Batch
Parameters:
  ImageId:
    Description: Image ID for base EC2 instance.
    Type: AWS::EC2::Image::Id
    # Deep learning AMI as the base
    Default: ami-9109beee
  InstanceType:
    Description: Instance type to launch EC2 instances.
    Type: String
    Default: g3.4xlarge
Resources:
  # Completes when the instance is fully provisioned and ready for AMI creation.
  AMICreate:
    Type: AWS::CloudFormation::WaitCondition
    CreationPolicy:
      ResourceSignal:
        Timeout: PT1H
  Instance:
    Type: AWS::EC2::Instance
    Properties:
      ImageId: !Ref ImageId
      InstanceType: !Ref InstanceType
      UserData:
        # The script is passed through Fn::Sub: only the AWS::StackName and
        # AWS::Region placeholders are substituted; plain shell variables
        # ($COMMAND, $nvidia_base, $?) are left untouched.
        "Fn::Base64": !Sub |
          #!/bin/bash -x
          sudo yum install -y ecs-init
          sudo service docker start
          wget https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm
          sudo rpm -ivh --nodeps nvidia-docker-1.0.1-1.x86_64.rpm

          # Validate installation
          rpm -ql nvidia-docker
          rm nvidia-docker-1.0.1-1.x86_64.rpm

          # Make sure the NVIDIA kernel modules and driver files are bootstrapped
          # Otherwise running a GPU job inside a container will fail with "cuda: unknown exception"
          echo '#!/bin/bash' | sudo tee /var/lib/cloud/scripts/per-boot/00_nvidia-modprobe > /dev/null
          echo 'nvidia-modprobe -u -c=0' | sudo tee --append /var/lib/cloud/scripts/per-boot/00_nvidia-modprobe > /dev/null
          sudo chmod +x /var/lib/cloud/scripts/per-boot/00_nvidia-modprobe
          sudo /var/lib/cloud/scripts/per-boot/00_nvidia-modprobe

          # Start the nvidia-docker-plugin and run a container with
          # nvidia-docker (retry up to 4 times if it fails initially)
          sudo -b nohup nvidia-docker-plugin > /tmp/nvidia-docker.log
          sudo docker pull nvidia/cuda:9.0-cudnn7-devel
          COMMAND="sudo nvidia-docker run nvidia/cuda:9.0-cudnn7-devel nvidia-smi"
          for i in {1..5}; do $COMMAND && break || sleep 15; done

          # Create symlink to latest nvidia-driver version
          nvidia_base=/var/lib/nvidia-docker/volumes/nvidia_driver
          sudo ln -s $nvidia_base/$(ls $nvidia_base | sort -n | tail -1) $nvidia_base/latest

          # NOTE(review): the exit status reported below is that of the "ln -s"
          # command above, not of the nvidia-smi validation loop.
          /opt/aws/bin/cfn-signal \
            -e $? \
            --stack ${AWS::StackName} \
            --region ${AWS::Region} \
            --resource AMICreate
          shutdown -h now
  # Custom resource whose physical ID (and therefore !Ref value) is the AMI ID.
  AMI:
    Type: Custom::AMI
    DependsOn: AMICreate
    Properties:
      ServiceToken: !GetAtt AMIFunction.Arn
      InstanceId: !Ref Instance
  AMIFunction:
    Type: AWS::Lambda::Function
    Properties:
      Handler: index.handler
      Role: !GetAtt LambdaExecutionRole.Arn
      # The inline code is passed through Fn::Sub, so it must not contain
      # dollar-brace sequences (e.g. JavaScript template literals).
      Code:
        ZipFile: !Sub |
          var response = require('cfn-response');
          var AWS = require('aws-sdk');

          // Custom resource handler.
          //   Create/Update: wait for the instance to stop, create an image from
          //                  it and wait until the image is available.
          //   Delete:        deregister the image (if it still exists) and delete
          //                  every snapshot whose description references it.
          // An event carrying a `waiter` property is a continuation that this
          // function sent to itself after running out of time (see wait()).
          exports.handler = function(event, context) {
            console.log("Request received:\n", JSON.stringify(event));
            var physicalId = event.PhysicalResourceId;
            var ec2 = new AWS.EC2();
            var instanceId = event.ResourceProperties.InstanceId;

            // Report success to CloudFormation using the current physical ID.
            function success(data) {
              return response.send(event, context, response.SUCCESS, data, physicalId);
            }

            // Log the error and report failure. The response data is always a
            // plain object so the payload is well-formed for strings and Errors.
            function failed(e) {
              console.log("Request failed:\n", e);
              var message = (e && e.message) ? e.message : String(e);
              return response.send(event, context, response.FAILED, {Error: message}, physicalId);
            }

            // Call ec2.waitFor, continuing if not finished before Lambda function timeout.
            // Shortly before the timeout the waiter is aborted and the function
            // re-invokes itself asynchronously with the waiter stored on the event.
            function wait(waiter) {
              console.log("Waiting: ", JSON.stringify(waiter));
              event.waiter = waiter;
              event.PhysicalResourceId = physicalId;
              var request = ec2.waitFor(waiter.state, waiter.params);
              var timer = setTimeout(()=>{
                request.abort();
                console.log("Timeout reached, continuing function. Params:\n", JSON.stringify(event));
                var lambda = new AWS.Lambda();
                lambda.invoke({
                  FunctionName: context.invokedFunctionArn,
                  InvocationType: 'Event',
                  Payload: JSON.stringify(event)
                }).promise().then((data)=>context.done()).catch((err)=>context.fail(err));
              }, context.getRemainingTimeInMillis() - 5000);
              return request.promise().then((data)=>{
                // Finished in time: cancel the pending continuation so it cannot
                // fire later on a reused container.
                clearTimeout(timer);
                return data;
              }).catch((err)=>{
                clearTimeout(timer);
                // An aborted waiter means the continuation has taken over; never
                // settle so that no response is sent from this invocation.
                return (err.code == 'RequestAbortedError') ?
                  new Promise(()=>context.done()) :
                  Promise.reject(err);
              });
            }

            if (event.waiter) {
              wait(event.waiter).then((data)=>success({})).catch((err)=>failed(err));
            } else if (event.RequestType == 'Create' || event.RequestType == 'Update') {
              // Stop here: without an instance ID there is nothing to image.
              if (!instanceId) { return failed('InstanceID required'); }
              ec2.waitFor('instanceStopped', {InstanceIds: [instanceId]}).promise()
              .then((data)=>
                ec2.createImage({
                  InstanceId: instanceId,
                  Name: event.RequestId
                }).promise()
              ).then((data)=>{
                // Record the new AMI as the physical ID before waiting, so both a
                // continuation invocation and the final response report it.
                physicalId = data.ImageId;
                return wait({
                  state: 'imageAvailable',
                  params: {ImageIds: [physicalId]}
                });
              }).then((data)=>success({})).catch((err)=>failed(err));
            } else if (event.RequestType == 'Delete') {
              // Nothing to clean up if no AMI was ever created for this resource.
              if (!physicalId || physicalId.indexOf('ami-') !== 0) { return success({}); }
              ec2.describeImages({ImageIds: [physicalId]}).promise()
              .catch((err)=>
                // An image that no longer exists counts as already deregistered.
                (err.code == 'InvalidAMIID.NotFound') ? {Images: []} : Promise.reject(err)
              ).then((data)=>
                (data.Images.length == 0) ? null :
                ec2.deregisterImage({ImageId: physicalId}).promise()
              ).then((data)=>
                ec2.describeSnapshots({Filters: [{
                  Name: 'description',
                  Values: ["*" + physicalId + "*"]
                }]}).promise()
              ).then((data)=>
                // An AMI can be backed by several snapshots; delete all of them.
                Promise.all(data.Snapshots.map((snapshot)=>
                  ec2.deleteSnapshot({SnapshotId: snapshot.SnapshotId}).promise()
                ))
              ).then((data)=>success({})).catch((err)=>failed(err));
            }
          };
      # NOTE(review): nodejs4.3 is a retired Lambda runtime. Moving to a current
      # runtime means porting the handler off the bundled aws-sdk v2 and the
      # context.done()/context.fail() API - confirm before deploying.
      Runtime: nodejs4.3
      Timeout: 300
  LambdaExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
        - Effect: Allow
          Principal: {Service: [lambda.amazonaws.com]}
          Action: ['sts:AssumeRole']
      Path: /
      ManagedPolicyArns:
      # Basic CloudWatch Logs access plus permission for the function to invoke
      # itself when a waiter has to continue in a new invocation.
      - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
      - arn:aws:iam::aws:policy/service-role/AWSLambdaRole
      Policies:
      - PolicyName: EC2Policy
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
              - 'ec2:DescribeInstances'
              - 'ec2:DescribeImages'
              - 'ec2:CreateImage'
              - 'ec2:DeregisterImage'
              - 'ec2:DescribeSnapshots'
              - 'ec2:DeleteSnapshot'
              Resource: ['*']
Outputs:
  AMI:
    Value: !Ref AMI