# CloudFormation template: companion infrastructure for the Ontop blog.
# The format version is quoted so that YAML tooling keeps it a string rather
# than implicitly typing it as a date.
AWSTemplateFormatVersion: "2010-09-09"
Description: Companion infrastructure for the Ontop blog
Parameters:
  # Environment tag. Forwarded to both nested stacks and also used as the
  # ECS cluster name. Lowercase letters and digits only, at most 15 chars.
  Env:
    Type: String
    Default: test
    Description: 'Environment tag, e.g. prod, nonprod.'
    MaxLength: 15
    AllowedPattern: '[a-z0-9]+'
  # Instance class handed to the Neptune base stack as DbInstanceType.
  NeptuneDbInstanceType:
    Type: String
    Default: 'db.serverless'
    Description: 'Neptune DB instance type'
    AllowedValues:
      - 'db.serverless'
      - 'db.t3.medium'
      # db.r5 family
      - 'db.r5.large'
      - 'db.r5.xlarge'
      - 'db.r5.2xlarge'
      - 'db.r5.4xlarge'
      - 'db.r5.8xlarge'
      # db.r6g family
      - 'db.r6g.large'
      - 'db.r6g.xlarge'
      - 'db.r6g.2xlarge'
      - 'db.r6g.4xlarge'
      - 'db.r6g.8xlarge'
  # String flag ("true"/"false", not a YAML boolean). Passed to the Neptune
  # base stack as IamAuthEnabled and to the notebook stack as
  # EnableIamAuthOnNeptune.
  IamAuthEnabled:
    Description: 'Enable IAM Auth for Neptune.'
    Type: String
    AllowedValues: ['true', 'false']
    Default: 'true'
  # Instance type handed to the SageMaker notebook nested stack.
  NotebookInstanceType:
    Type: String
    Default: 'ml.t3.medium'
    ConstraintDescription: 'Must be a valid SageMaker instance type.'
    Description: >-
      SageMaker Notebook instance type.
      Please refer https://aws.amazon.com/sagemaker/pricing/
      for uptodate allowed instance type in aws region
      and https://aws.amazon.com/neptune/pricing/ for pricing.
    AllowedValues:
      # ml.t2 family
      - 'ml.t2.medium'
      - 'ml.t2.large'
      - 'ml.t2.xlarge'
      - 'ml.t2.2xlarge'
      # ml.t3 family
      - 'ml.t3.medium'
      - 'ml.t3.large'
      - 'ml.t3.xlarge'
      - 'ml.t3.2xlarge'
      # ml.m4 family
      - 'ml.m4.xlarge'
      - 'ml.m4.2xlarge'
      - 'ml.m4.4xlarge'
      - 'ml.m4.10xlarge'
      - 'ml.m4.16xlarge'
      # ml.m5 family
      - 'ml.m5.xlarge'
      - 'ml.m5.2xlarge'
      - 'ml.m5.4xlarge'
      - 'ml.m5.12xlarge'
      - 'ml.m5.24xlarge'
      # ml.c4 family
      - 'ml.c4.large'
      - 'ml.c4.xlarge'
      - 'ml.c4.2xlarge'
      - 'ml.c4.4xlarge'
      - 'ml.c4.8xlarge'
      # ml.c5 family
      - 'ml.c5.xlarge'
      - 'ml.c5.2xlarge'
      - 'ml.c5.4xlarge'
      - 'ml.c5.9xlarge'
      - 'ml.c5.18xlarge'
      # ml.c5d family
      - 'ml.c5d.xlarge'
      - 'ml.c5d.2xlarge'
      - 'ml.c5d.4xlarge'
      - 'ml.c5d.9xlarge'
      - 'ml.c5d.18xlarge'
      # ml.p3 family
      - 'ml.p3.2xlarge'
      - 'ml.p3.8xlarge'
      - 'ml.p3.16xlarge'
      # ml.p2 family
      - 'ml.p2.2xlarge'
      - 'ml.p2.8xlarge'
      - 'ml.p2.16xlarge'
  # Name of the public source bucket. Exported to the notebook environment
  # as PUB_BUCKET by the setup Lambda.
  SourceS3BucketName:
    Type: String
    Default: 'aws-neptune-customer-samples'
    Description: 'Public bucket containing weather data for this demo'
  # Key prefix inside the public source bucket. Exported to the notebook
  # environment as ROOT_DIR_NOSLASH by the setup Lambda. The name asks for no
  # leading/trailing slash, but nothing in this template enforces that.
  SourceS3BucketFolderNoSlash:
    Type: String
    Default: 'neptune-virtualization/blog'
    Description: 'Folder in public bucket containing weather data for this demo'

Resources:

  # Stack-owned bucket for the weather data: used by the lake and as the
  # staging area for Neptune bulk loads. Objects are encrypted with SSE-S3.
  # NOTE(review): DeletionPolicy is Delete - confirm the bucket is emptied
  # before the stack is torn down.
  S3DataBucket:
    Type: 'AWS::S3::Bucket'
    DeletionPolicy: Delete
    Properties:
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256

  # Lambda backing the EnvSetup custom resource.
  # It copies nothing itself: it returns a shell snippet (response attribute
  # NotebookAddScript) that the notebook stack receives as its StartupScript.
  # The snippet appends exports of PUB_BUCKET, DATA_BUCKET, ROOT_DIR_NOSLASH
  # and STACKNAME to ~/.bashrc and downloads climate-data-queries.ipynb from
  # GitHub into /home/ec2-user/SageMaker when the file is not already there.
  SetupFunction:
    Type: AWS::Lambda::Function
    Properties:
      Role: !GetAtt 'LambdaExecutionRole.Arn'
      FunctionName: !Join ['-', [!Ref 'AWS::StackName', 'copy']]
      MemorySize: 1024
      Runtime: python3.10
      Timeout: 300
      Handler: index.handler
      Code:
        ZipFile:
          # Fn::Sub replaces the ${...} placeholders below before the code is
          # deployed; the Python f-strings use plain {...} and are untouched.
          Fn::Sub:
            - |-
              import json
              import logging
              import cfnresponse

              LOGGER = logging.getLogger()
              LOGGER.setLevel(logging.INFO)

              PUB_BUCKET = '${SourceS3BucketName}'
              ROOT_DIR_NOSLASH = '${SourceS3BucketFolderNoSlash}'
              DATA_BUCKET = '${S3DataBucket}'
              STACKNAME = '${StackName}'


              def build_script():
                  """Return the shell snippet run at notebook start-up."""
                  lines = [
                      f'echo "export PUB_BUCKET={PUB_BUCKET}" >> ~/.bashrc',
                      f'echo "export DATA_BUCKET={DATA_BUCKET}" >> ~/.bashrc',
                      f'echo "export ROOT_DIR_NOSLASH={ROOT_DIR_NOSLASH}" >> ~/.bashrc',
                      f'echo "export STACKNAME={STACKNAME}" >> ~/.bashrc',
                      'if [ ! -f /home/ec2-user/SageMaker/climate-data-queries.ipynb  ]',
                      'then',
                      '  cd /home/ec2-user/SageMaker',
                      '  wget https://raw.githubusercontent.com/aws-samples/amazon-neptune-graph-virtualization/main/notebook/climate-data-queries.ipynb',
                      'fi',
                  ]
                  return ''.join(line + '\n' for line in lines)


              def handler(event, context):
                  """Answer every custom-resource request (Create/Update/Delete)."""
                  response_data = {}
                  try:
                      response_data['NotebookAddScript'] = build_script()
                      response_data['Data'] = 'git success'
                      status = cfnresponse.SUCCESS
                  except Exception:
                      # Always report back: without a response CloudFormation
                      # keeps waiting until the custom resource times out.
                      LOGGER.exception('Failed to build notebook start-up script')
                      status = cfnresponse.FAILED
                  cfnresponse.send(event, context, status, response_data)

                  return {
                      'statusCode': 200,
                      'body': json.dumps('Copied Files')
                  }
            - SourceS3BucketName: !Ref SourceS3BucketName
              SourceS3BucketFolderNoSlash: !Ref SourceS3BucketFolderNoSlash
              S3DataBucket: !Ref S3DataBucket
              StackName: !Ref AWS::StackName
  
  # Execution role for the custom-resource Lambda (SetupFunction).
  # The function only builds a string and answers CloudFormation, so it needs
  # CloudWatch Logs permissions and nothing else. The previously attached
  # AWSLambdaExecute policy additionally granted S3 object read/write on all
  # buckets, which the function never uses.
  LambdaExecutionRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - lambda.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      Path: /
      ManagedPolicyArns:
        # AWS::Partition keeps the ARN valid outside the standard "aws" partition.
        - !Sub 'arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
  
  # Custom resource that invokes SetupFunction. Its response data is read
  # elsewhere in this template via !GetAtt EnvSetup.NotebookAddScript.
  # The handler shown in SetupFunction does not read the properties below;
  # they are only delivered to it as part of the request.
  EnvSetup:
    Type: Custom::EnvSetup
    DependsOn: [LambdaExecutionRole, S3DataBucket]
    Properties:
      ServiceToken: !GetAtt [SetupFunction, Arn]
      pub_bucket: !Ref SourceS3BucketName
      pub_bucket_path: !Ref SourceS3BucketFolderNoSlash
      data_bucket: !Ref S3DataBucket

  # Neptune base stack (nested). Also creates the VPC, the S3 VPC endpoint
  # and the bulk-load role.
  # NOTE(review): an earlier remark here said encryption (presumably "at
  # rest") is not currently supported by the quickstart template, yet
  # StorageEncrypted is passed as "true" below - confirm which is accurate.
  NeptuneCoreStack:
    Type: "AWS::CloudFormation::Stack"
    Properties:
      TemplateURL: "https://s3.amazonaws.com/aws-neptune-customer-samples/v2/cloudformation-templates/neptune-base-stack.json"
      TimeoutInMinutes: 60
      Parameters:
        # "gv-" + the third "/"-separated segment of the stack id with its
        # dashes removed (presumably the stack's unique id suffix).
        DBClusterId: !Join [ "-", ["gv", !Join [ "", !Split [ "-", !Select [ 2,  !Split [ "/", !Ref AWS::StackId ] ] ] ] ] ]
        DbInstanceType: !Ref NeptuneDbInstanceType
        Env: !Ref Env
        IamAuthEnabled: !Ref IamAuthEnabled
        # Nested-stack parameter values are strings; quote them so YAML does
        # not type them as an integer / boolean.
        NeptuneEnableAuditLog: "1"
        StorageEncrypted: "true"

  # Nested stack built from the neptune-sagemaker-notebook-stack template.
  # It is wired to the outputs of NeptuneCoreStack and receives the shell
  # snippet produced by the EnvSetup custom resource as its StartupScript.
  # NOTE(review): the original remark here ("Encryption at not, not currently
  # supported by quickstart") is garbled - presumably it refers to encryption
  # at rest; confirm and reword.
  NeptuneNotebookStack:
    Type: 'AWS::CloudFormation::Stack'
    Properties:
      TimeoutInMinutes: '60'
      TemplateURL: 'https://s3.amazonaws.com/aws-neptune-customer-samples/v2/cloudformation-templates/neptune-sagemaker-notebook-stack.json'
      Parameters:
        # Values taken from this template's own parameters
        Env: !Ref Env
        NotebookInstanceType: !Ref NotebookInstanceType
        EnableIamAuthOnNeptune: !Ref IamAuthEnabled
        # Values taken from the Neptune base stack outputs
        NeptuneClusterVpc: !GetAtt [NeptuneCoreStack, Outputs.VPC]
        NeptuneClusterSubnetId: !GetAtt [NeptuneCoreStack, Outputs.PublicSubnet1]
        NeptuneClientSecurityGroup: !GetAtt [NeptuneCoreStack, Outputs.NeptuneSG]
        NeptuneLoadFromS3RoleArn: !GetAtt [NeptuneCoreStack, Outputs.NeptuneLoadFromS3IAMRoleArn]
        DBClusterId: !GetAtt [NeptuneCoreStack, Outputs.DBClusterId]
        NeptuneClusterResourceId: !GetAtt [NeptuneCoreStack, Outputs.DBClusterResourceId]
        NeptuneClusterEndpoint: !GetAtt [NeptuneCoreStack, Outputs.DBClusterEndpoint]
        # Script returned by the setup custom resource
        StartupScript: !GetAtt [EnvSetup, NotebookAddScript]

  # Security group for lake traffic, created in the VPC of the Neptune base
  # stack. Defined the way the ECS console does, plus one extra rule:
  # inbound TCP 8080 from the Neptune security group.
  ECSSecurityGroup:
    Type: "AWS::EC2::SecurityGroup"
    Properties:
      GroupDescription: "Lake traffic"
      VpcId: !GetAtt NeptuneCoreStack.Outputs.VPC
      SecurityGroupEgress:
        # "-1" means all protocols; quoted because IpProtocol is a string
        # property and a bare -1 is parsed as an integer.
        - IpProtocol: "-1"
          CidrIp: "0.0.0.0/0"
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 8080
          ToPort: 8080
          SourceSecurityGroupId: !GetAtt NeptuneCoreStack.Outputs.NeptuneSG

  # ECS cluster, named after the Env parameter. Only the cluster is created
  # here; the task is a post-setup step the reader performs from the notebook.
  ECSCluster:
    Type: 'AWS::ECS::Cluster'
    Properties:
      ClusterName:
        Ref: Env

  # ECR repository with a fixed name. It starts empty: the reader builds the
  # image and pushes it after the stack has been set up.
  ECRLakeRepository:
    Type: 'AWS::ECR::Repository'
    Properties:
      RepositoryName: ontop-graph-weather-lake

  # Role for the ECS task, once it is ready (trusted principal:
  # ecs-tasks.amazonaws.com). The inline policy grants Athena query access,
  # Glue access to the ontop_demo_lake catalog objects, read access to the
  # data bucket and write access under its results/ prefix.
  ECSTaskRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        # Explicit version so the trust policy uses the current policy language.
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: ontop-lake-policy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Athena: run queries and read catalog metadata.
              # (IAM action names are case-insensitive, so the former
              # athena:getQueryExecution entry was a duplicate.)
              - Effect: Allow
                Action:
                  - athena:BatchGetQueryExecution
                  - athena:GetQueryExecution
                  - athena:GetQueryResults
                  - athena:GetQueryResultsStream
                  - athena:ListQueryExecutions
                  - athena:StartQueryExecution
                  - athena:StopQueryExecution
                  - athena:ListWorkGroups
                  - athena:ListEngineVersions
                  - athena:GetWorkGroup
                  - athena:GetDataCatalog
                  - athena:GetDatabase
                  - athena:GetTableMetadata
                  - athena:ListDataCatalogs
                  - athena:ListDatabases
                  - athena:ListTableMetadata
                Resource: "*"
              # Glue: any action, but only on the demo database and its tables.
              # NOTE(review): glue:* is broad - consider narrowing once the
              # exact calls made by the task are known.
              - Effect: Allow
                Action: "glue:*"
                Resource:
                  - !Sub 'arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:catalog'
                  - !Sub 'arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:database/ontop_demo_lake'
                  - !Sub 'arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:table/ontop_demo_lake/*'
              # S3: read-only access to the whole data bucket.
              - Effect: Allow
                Action:
                  - s3:GetBucketLocation
                  - s3:GetObject
                  - s3:ListBucket
                  - s3:ListBucketMultipartUploads
                  - s3:ListMultipartUploadParts
                Resource:
                  - !Sub 'arn:${AWS::Partition}:s3:::${S3DataBucket}'
                  - !Sub 'arn:${AWS::Partition}:s3:::${S3DataBucket}/*'
              # S3: object read/write under results/. Only object-level actions
              # are listed; bucket-level actions cannot match an object ARN and
              # are already granted on the bucket by the statement above.
              - Effect: Allow
                Action:
                  - s3:GetObject
                  - s3:PutObject
                  - s3:AbortMultipartUpload
                  - s3:ListMultipartUploadParts
                Resource:
                  - !Sub 'arn:${AWS::Partition}:s3:::${S3DataBucket}/results/*'

  # Glue Data Catalog database (ontop_demo_lake) in this account's catalog.
  DemoVirtualizationDatabase:
    Type: "AWS::Glue::Database"
    Properties:
      DatabaseInput:
        Name: ontop_demo_lake
        # LocationUri is a URI; !Ref on the bucket yields only the bucket
        # name, so build the s3:// form explicitly.
        LocationUri: !Sub 's3://${S3DataBucket}/'
        Description: "Demo virtualization lake"
      CatalogId: !Ref AWS::AccountId

  # Role assumed by Glue (glue.amazonaws.com) and used by ClimateCrawler:
  # the AWS-managed AWSGlueServiceRole policy plus object get/put on every
  # key in the data bucket.
  CrawlerRole:
    Type: 'AWS::IAM::Role'
    Properties:
      Path: /
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Action:
              - 'sts:AssumeRole'
            Principal:
              Service:
                - glue.amazonaws.com
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole'
      Policies:
        - PolicyName: S3BucketAccessPolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - 's3:GetObject'
                  - 's3:PutObject'
                # <bucket ARN>/* - all objects in the data bucket
                Resource: !Sub '${S3DataBucket.Arn}/*'
  # Glue crawler over s3://<data bucket>/lake/climate/, writing into the
  # DemoVirtualizationDatabase catalog database with the CrawlerRole.
  ClimateCrawler:
    Type: 'AWS::Glue::Crawler'
    Properties:
      Name: ClimateCrawler
      DatabaseName: !Ref DemoVirtualizationDatabase
      Role: !GetAtt [CrawlerRole, Arn]
      Targets:
        S3Targets:
          - Path: !Join ['', ['s3://', !Ref S3DataBucket, '/lake/climate/']]
      # Crawler configuration is passed as a JSON string.
      Configuration: '{"Version":1.0,"CreatePartitionIndex":true}'
      SchemaChangePolicy:
        DeleteBehavior: LOG
        UpdateBehavior: UPDATE_IN_DATABASE

Outputs:

  # --- Values passed through from the Neptune base stack ---
  DBClusterEndpoint:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterEndpoint
  DBClusterId:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterId
  DBClusterPort:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterPort
  DBClusterResourceId:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterResourceId
  NeptuneLoadFromS3IAMRoleArn:
    Value: !GetAtt NeptuneCoreStack.Outputs.NeptuneLoadFromS3IAMRoleArn
  PrivateSubnet1:
    Value: !GetAtt NeptuneCoreStack.Outputs.PrivateSubnet1
  PrivateSubnet2:
    Value: !GetAtt NeptuneCoreStack.Outputs.PrivateSubnet2
  PublicSubnet1:
    Value: !GetAtt NeptuneCoreStack.Outputs.PublicSubnet1
  VPC:
    Value: !GetAtt NeptuneCoreStack.Outputs.VPC

  # --- Value passed through from the notebook stack ---
  NeptuneSagemakerNotebook:
    Value: !GetAtt NeptuneNotebookStack.Outputs.NeptuneSagemakerNotebook

  # --- Resources created directly by this template (their !Ref values) ---
  S3DataBucket:
    Value:
      Ref: S3DataBucket
  ECSSecurityGroup:
    Value:
      Ref: ECSSecurityGroup
  ECSCluster:
    Value:
      Ref: ECSCluster
  ECRLakeRepository:
    Value:
      Ref: ECRLakeRepository
  ECSTaskRole:
    Value:
      Ref: ECSTaskRole
  DemoVirtualizationDatabase:
    Value:
      Ref: DemoVirtualizationDatabase