# CloudFormation template: companion infrastructure for the Ontop blog.
#
# Creates, in one stack:
#   * a Cloud9 IDE,
#   * an S3 data bucket,
#   * a Lambda-backed custom resource that produces the notebook startup script,
#   * two nested stacks (Neptune base stack and Neptune SageMaker notebook),
#   * an ECS cluster, an ECS security group, an ECR repository and an ECS task role,
#   * a Glue database, a crawler role and a crawler.
AWSTemplateFormatVersion: "2010-09-09"
Description: Companion infrastructure for the Ontop blog

Parameters:
  Env:
    Description: "Environment tag, e.g. prod, nonprod."
    Default: test
    Type: String
    AllowedPattern: "[a-z0-9]+"
    MaxLength: 15

  NeptuneDbInstanceType:
    Description: Neptune DB instance type
    Type: String
    Default: db.serverless
    AllowedValues:
      - db.serverless
      - db.t3.medium
      - db.r5.large
      - db.r5.xlarge
      - db.r5.2xlarge
      - db.r5.4xlarge
      - db.r5.8xlarge
      - db.r6g.large
      - db.r6g.xlarge
      - db.r6g.2xlarge
      - db.r6g.4xlarge
      - db.r6g.8xlarge

  # Kept as a String ("true"/"false") because it is forwarded verbatim to the
  # nested stacks as a parameter value.
  IamAuthEnabled:
    Type: String
    Default: "true"
    AllowedValues:
      - "true"
      - "false"
    Description: Enable IAM Auth for Neptune.

  NotebookInstanceType:
    Description: >-
      SageMaker Notebook instance type. Please refer
      https://aws.amazon.com/sagemaker/pricing/ for uptodate allowed instance
      type in aws region and https://aws.amazon.com/neptune/pricing/ for
      pricing.
    Type: String
    Default: ml.t3.medium
    AllowedValues:
      - ml.t2.medium
      - ml.t2.large
      - ml.t2.xlarge
      - ml.t2.2xlarge
      - ml.t3.medium
      - ml.t3.large
      - ml.t3.xlarge
      - ml.t3.2xlarge
      - ml.m4.xlarge
      - ml.m4.2xlarge
      - ml.m4.4xlarge
      - ml.m4.10xlarge
      - ml.m4.16xlarge
      - ml.m5.xlarge
      - ml.m5.2xlarge
      - ml.m5.4xlarge
      - ml.m5.12xlarge
      - ml.m5.24xlarge
      - ml.c4.large
      - ml.c4.xlarge
      - ml.c4.2xlarge
      - ml.c4.4xlarge
      - ml.c4.8xlarge
      - ml.c5.xlarge
      - ml.c5.2xlarge
      - ml.c5.4xlarge
      - ml.c5.9xlarge
      - ml.c5.18xlarge
      - ml.c5d.xlarge
      - ml.c5d.2xlarge
      - ml.c5d.4xlarge
      - ml.c5d.9xlarge
      - ml.c5d.18xlarge
      - ml.p3.2xlarge
      - ml.p3.8xlarge
      - ml.p3.16xlarge
      - ml.p2.2xlarge
      - ml.p2.8xlarge
      - ml.p2.16xlarge
    ConstraintDescription: Must be a valid SageMaker instance type.

  SourceS3BucketName:
    Description: "Public bucket containing weather data for this demo"
    Default: aws-neptune-customer-samples
    Type: String

  SourceS3BucketFolderNoSlash:
    Description: "Folder in public bucket containing weather data for this demo"
    Default: neptune-virtualization/blog
    Type: String

  Cloud9InstanceType:
    Description: EC2 instance type on which IDE runs
    Type: String
    Default: t2.micro
    AllowedValues:
      - t2.nano
      - t2.micro
      - t2.small
      - t2.medium
      - t2.large
      - t2.xlarge
      - t2.2xlarge
      - m3.medium
      - m3.large
      - m3.xlarge
      - m3.2xlarge
      - m4.large
      - m4.xlarge
      - m4.2xlarge
      - m4.4xlarge
      - m4.10xlarge
      - m4.16xlarge
      - c3.large
      - c3.xlarge
      - c3.2xlarge
      - c3.4xlarge
      - c3.8xlarge
      - c4.large
      - c4.xlarge
      - c4.2xlarge
      - c4.4xlarge
      - c4.8xlarge

Resources:
  # Cloud9 IDE, used to build the Docker image for Ontop.
  # NOTE(review): the original comment promised "two purposes" but named only
  # this one - add the second purpose here if it is still relevant.
  Cloud9IDE:
    Type: AWS::Cloud9::EnvironmentEC2
    Properties:
      Name: OntopCloud9IDE
      InstanceType: !Ref Cloud9InstanceType
      ImageId: amazonlinux-2023-x86_64
      # Placed in the public subnet exported by the Neptune base stack.
      SubnetId: !GetAtt NeptuneCoreStack.Outputs.PublicSubnet1

  # S3 bucket for weather data, used by lake and for Neptune bulk-load staging
  S3DataBucket:
    Type: AWS::S3::Bucket
    DeletionPolicy: Delete
    Properties:
      BucketEncryption:
        ServerSideEncryptionConfiguration:
          - ServerSideEncryptionByDefault:
              SSEAlgorithm: AES256

  # Custom resource lambda that builds the notebook startup script. The script
  # exports key env vars for use in the notebook:
  #   PUB_BUCKET, DATA_BUCKET, ROOT_DIR_NOSLASH, STACKNAME
  # and downloads the notebook from Git onto the notebook instance if it is
  # not already there.
  SetupFunction:
    Type: AWS::Lambda::Function
    Properties:
      Role: !GetAtt LambdaExecutionRole.Arn
      FunctionName: !Sub "${AWS::StackName}-copy"
      MemorySize: 1024
      Runtime: python3.10
      Timeout: 300
      Handler: index.handler
      Code:
        # Inline code: Fn::Sub replaces only the ${...} placeholders listed in
        # the variable map below; plain {NAME} f-string fields are untouched.
        ZipFile:
          Fn::Sub:
            - |-
              import json
              import logging

              import cfnresponse

              LOGGER = logging.getLogger()
              LOGGER.setLevel(logging.INFO)

              PUB_BUCKET = '${SourceS3BucketName}'
              ROOT_DIR_NOSLASH = '${SourceS3BucketFolderNoSlash}'
              DATA_BUCKET = '${S3DataBucket}'
              STACKNAME = '${StackName}'


              def build_notebook_script():
                  """Return the shell snippet handed to the notebook stack."""
                  lines = [
                      f'echo "export PUB_BUCKET={PUB_BUCKET}" >> ~/.bashrc',
                      f'echo "export DATA_BUCKET={DATA_BUCKET}" >> ~/.bashrc',
                      f'echo "export ROOT_DIR_NOSLASH={ROOT_DIR_NOSLASH}" >> ~/.bashrc',
                      f'echo "export STACKNAME={STACKNAME}" >> ~/.bashrc',
                      'if [ ! -f /home/ec2-user/SageMaker/climate-data-queries.ipynb ]',
                      'then',
                      ' cd /home/ec2-user/SageMaker',
                      ' wget https://raw.githubusercontent.com/aws-samples/amazon-neptune-graph-virtualization/main/notebook/climate-data-queries.ipynb',
                      'fi',
                  ]
                  return '\n'.join(lines) + '\n'


              def handler(event, context):
                  """Custom resource entry point.

                  Always reports back to CloudFormation: SUCCESS with the
                  'Data' and 'NotebookAddScript' attributes, or FAILED if
                  building the response raised, so the stack operation does
                  not sit waiting for a response that never arrives.
                  """
                  try:
                      response_data = {
                          'Data': 'git success',
                          'NotebookAddScript': build_notebook_script(),
                      }
                  except Exception:
                      LOGGER.exception('Failed to build notebook startup script')
                      cfnresponse.send(event, context, cfnresponse.FAILED, {})
                      raise

                  cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data)
                  return {
                      'statusCode': 200,
                      'body': json.dumps('Copied Files')
                  }
            - SourceS3BucketName: !Ref SourceS3BucketName
              SourceS3BucketFolderNoSlash: !Ref SourceS3BucketFolderNoSlash
              S3DataBucket: !Ref S3DataBucket
              StackName: !Ref AWS::StackName

  # Role for the custom resource lambda
  LambdaExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - lambda.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: /
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AWSLambdaExecute

  # Custom resource to help setup. Its NotebookAddScript attribute is consumed
  # by NeptuneNotebookStack as the notebook StartupScript.
  EnvSetup:
    Type: Custom::EnvSetup
    DependsOn:
      - LambdaExecutionRole
      - S3DataBucket
    Properties:
      ServiceToken: !GetAtt SetupFunction.Arn
      data_bucket: !Ref S3DataBucket
      pub_bucket: !Ref SourceS3BucketName
      pub_bucket_path: !Ref SourceS3BucketFolderNoSlash

  # Neptune base stack. Also creates VPC, S3 VPC endpoint, bulk load role
  # NOTE(review): the original comment here read "Encryption at rest, not
  # currently supported by quickstart", yet StorageEncrypted is passed below -
  # confirm which is current.
  NeptuneCoreStack:
    Type: AWS::CloudFormation::Stack
    Properties:
      TemplateURL: "https://s3.amazonaws.com/aws-neptune-customer-samples/v2/cloudformation-templates/neptune-base-stack.json"
      TimeoutInMinutes: "60"
      Parameters:
        # "gv-" followed by the third "/"-separated segment of the stack ID
        # with its dashes removed.
        DBClusterId: !Join [ "-", ["gv", !Join [ "", !Split [ "-", !Select [ 2, !Split [ "/", !Ref AWS::StackId ] ] ] ] ] ]
        DbInstanceType: !Ref NeptuneDbInstanceType
        Env: !Ref Env
        IamAuthEnabled: !Ref IamAuthEnabled
        # Nested-stack parameter values are strings; quoted so no YAML loader
        # retypes them as int/bool.
        NeptuneEnableAuditLog: "1"
        StorageEncrypted: "true"

  # Neptune SageMaker notebook, wired to the outputs of the base stack.
  # Encryption at rest, not currently supported by quickstart
  NeptuneNotebookStack:
    Type: AWS::CloudFormation::Stack
    Properties:
      TemplateURL: "https://s3.amazonaws.com/aws-neptune-customer-samples/v2/cloudformation-templates/neptune-sagemaker-notebook-stack.json"
      TimeoutInMinutes: "60"
      Parameters:
        Env: !Ref Env
        NotebookInstanceType: !Ref NotebookInstanceType
        NeptuneClusterVpc: !GetAtt NeptuneCoreStack.Outputs.VPC
        NeptuneClusterSubnetId: !GetAtt NeptuneCoreStack.Outputs.PublicSubnet1
        NeptuneClientSecurityGroup: !GetAtt NeptuneCoreStack.Outputs.NeptuneSG
        NeptuneLoadFromS3RoleArn: !GetAtt NeptuneCoreStack.Outputs.NeptuneLoadFromS3IAMRoleArn
        DBClusterId: !GetAtt NeptuneCoreStack.Outputs.DBClusterId
        NeptuneClusterResourceId: !GetAtt NeptuneCoreStack.Outputs.DBClusterResourceId
        NeptuneClusterEndpoint: !GetAtt NeptuneCoreStack.Outputs.DBClusterEndpoint
        EnableIamAuthOnNeptune: !Ref IamAuthEnabled
        StartupScript: !GetAtt EnvSetup.NotebookAddScript

  # Define as ECS console does except add 8080
  ECSSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: "Lake traffic"
      VpcId: !GetAtt NeptuneCoreStack.Outputs.VPC
      SecurityGroupEgress:
        # "-1" means all protocols; quoted because the property is a string.
        - IpProtocol: "-1"
          CidrIp: "0.0.0.0/0"
      SecurityGroupIngress:
        # Port 8080 is reachable only from members of the Neptune security group.
        - IpProtocol: tcp
          FromPort: 8080
          ToPort: 8080
          SourceSecurityGroupId: !GetAtt NeptuneCoreStack.Outputs.NeptuneSG

  # We need an ECS cluster. We will create a task too, but that's post-setup
  # The reader will do that on their own from the notebook
  ECSCluster:
    Type: AWS::ECS::Cluster
    Properties:
      ClusterName: !Ref Env

  # ECR repo. Reader will push image post-setup once they've built it
  ECRLakeRepository:
    Type: AWS::ECR::Repository
    Properties:
      RepositoryName: "ontop-graph-weather-lake"

  # The role for the ECS task, once it's ready
  ECSTaskRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        # Explicit version so the trust policy uses the current policy language
        # like every other policy document in this template.
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: ontop-lake-policy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Athena query and metadata access.
              - Effect: Allow
                Action:
                  - athena:BatchGetQueryExecution
                  - athena:GetQueryExecution
                  - athena:GetQueryResults
                  - athena:GetQueryResultsStream
                  - athena:ListQueryExecutions
                  - athena:StartQueryExecution
                  - athena:StopQueryExecution
                  - athena:ListWorkGroups
                  - athena:ListEngineVersions
                  - athena:GetWorkGroup
                  - athena:GetDataCatalog
                  - athena:GetDatabase
                  - athena:GetTableMetadata
                  - athena:ListDataCatalogs
                  - athena:ListDatabases
                  - athena:ListTableMetadata
                Resource: "*"
              # Glue access limited to the demo database and its tables.
              - Effect: Allow
                Action: "glue:*"
                Resource:
                  - !Sub "arn:aws:glue:${AWS::Region}:${AWS::AccountId}:catalog"
                  - !Sub "arn:aws:glue:${AWS::Region}:${AWS::AccountId}:database/ontop_demo_lake"
                  - !Sub "arn:aws:glue:${AWS::Region}:${AWS::AccountId}:table/ontop_demo_lake/*"
              # Read access to the whole data bucket.
              - Effect: Allow
                Action:
                  - s3:GetBucketLocation
                  - s3:GetObject
                  - s3:ListBucket
                  - s3:ListBucketMultipartUploads
                  - s3:ListMultipartUploadParts
                Resource:
                  - !Sub "arn:aws:s3:::${S3DataBucket}"
                  - !Sub "arn:aws:s3:::${S3DataBucket}/*"
              # Write access only under the results/ prefix.
              - Effect: Allow
                Action:
                  - s3:GetBucketLocation
                  - s3:GetObject
                  - s3:ListBucket
                  - s3:ListBucketMultipartUploads
                  - s3:AbortMultipartUpload
                  - s3:PutObject
                  - s3:ListMultipartUploadParts
                Resource:
                  - !Sub "arn:aws:s3:::${S3DataBucket}/results/*"

  # Glue catalog DB
  DemoVirtualizationDatabase:
    Type: AWS::Glue::Database
    Properties:
      DatabaseInput:
        Name: ontop_demo_lake
        # A location URI rather than the bare bucket name that !Ref returns.
        LocationUri: !Sub "s3://${S3DataBucket}/"
        Description: "Demo virtualization lake"
      CatalogId: !Ref AWS::AccountId

  # Role assumed by the Glue crawler: the Glue service managed policy plus
  # object read/write on the data bucket.
  CrawlerRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - glue.amazonaws.com
            Action:
              - sts:AssumeRole
      Path: /
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
      Policies:
        - PolicyName: S3BucketAccessPolicy
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              - Effect: Allow
                Action:
                  - s3:GetObject
                  - s3:PutObject
                Resource: !Sub "${S3DataBucket.Arn}/*"

  # Crawler over s3://<data bucket>/lake/climate/ that writes into the demo
  # Glue database.
  ClimateCrawler:
    Type: AWS::Glue::Crawler
    Properties:
      Name: ClimateCrawler
      Role: !GetAtt CrawlerRole.Arn
      DatabaseName: !Ref DemoVirtualizationDatabase
      Targets:
        S3Targets:
          - Path: !Sub "s3://${S3DataBucket}/lake/climate/"
      SchemaChangePolicy:
        UpdateBehavior: UPDATE_IN_DATABASE
        DeleteBehavior: LOG
      # JSON document passed as a string; single-quoted so the inner double
      # quotes need no escaping.
      Configuration: '{"Version":1.0,"CreatePartitionIndex":true}'

Outputs:
  # Values re-exported from the Neptune base stack.
  DBClusterEndpoint:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterEndpoint
  DBClusterId:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterId
  DBClusterPort:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterPort
  DBClusterResourceId:
    Value: !GetAtt NeptuneCoreStack.Outputs.DBClusterResourceId
  NeptuneLoadFromS3IAMRoleArn:
    Value: !GetAtt NeptuneCoreStack.Outputs.NeptuneLoadFromS3IAMRoleArn
  PrivateSubnet1:
    Value: !GetAtt NeptuneCoreStack.Outputs.PrivateSubnet1
  PrivateSubnet2:
    Value: !GetAtt NeptuneCoreStack.Outputs.PrivateSubnet2
  PublicSubnet1:
    Value: !GetAtt NeptuneCoreStack.Outputs.PublicSubnet1
  VPC:
    Value: !GetAtt NeptuneCoreStack.Outputs.VPC

  # Value re-exported from the notebook stack.
  NeptuneSagemakerNotebook:
    Value: !GetAtt NeptuneNotebookStack.Outputs.NeptuneSagemakerNotebook

  # Resources created directly by this template.
  S3DataBucket:
    Value: !Ref S3DataBucket
  ECSSecurityGroup:
    Value: !Ref ECSSecurityGroup
  ECSCluster:
    Value: !Ref ECSCluster
  ECRLakeRepository:
    Value: !Ref ECRLakeRepository
  ECSTaskRole:
    Value: !Ref ECSTaskRole
  DemoVirtualizationDatabase:
    Value: !Ref DemoVirtualizationDatabase
  Cloud9IDE:
    Value: !Ref Cloud9IDE