#!/usr/bin/env python3

###################################
#
#  Download data into PowerBI through SQL
#  ======================================
#
#  BEFORE YOU RUN THIS:
#
#      1. Install boto3, pandas, and numpy.
#
#         - Open the windows command prompt (cmd)
#         - Run "conda activate azureml_py36_automl"
#         - Run "conda install pandas numpy boto3"
#
#      2. Set PowerBI's Python to azureml_py36_automl .
#
#         - Go to File -> Options and settings -> Options -> Python scripting
#         - Set a Python home directory to "C:\Miniconda\envs\azureml_py36_automl"
#
#
#      2.5. (Optional) Test your script in Python
#
#           - Copy this file to your Desktop and adjust it
#             to use your ACCESS_KEY and SECRET_KEY, and to
#             access your data/make your queries. Make sure
#             that your queries are returning what you want.
#
#      3. Run this script inside PowerBI
#
#         - Click "Get Data" and select "Python script"
#         - Copy paste your version of this script into the text box.
#         - Edit your script to at least include your ACCESS_KEY and SECRET_KEY.
#
#
#
#      NOTE: To get your ACCESS_KEY and SECRET_KEY, go here for instructions.
#
#            https://statcan.github.io/aaw/en/Storage/#programmatic-access
#
#            Do `ls --ignore='*.json' /vault/secrets` to check MinIO instances, then pick an instance
#            Then `source /vault/secrets/{instance}` and run
#
#                  $ echo $MINIO_ACCESS_KEY
#                  $ echo $MINIO_SECRET_KEY
#
#
#  edited: May 19, 2020
#
###################################

import boto3
from os import remove
import tempfile
import pandas as pd
import sys

#########################
#        CONFIG         #
#########################

TENANT = 'minio' # options: minio or minio-premium

# mandatory. Must get your credentials from Jupyter.
ACCESS_KEY = 'XXXXXXXX'
SECRET_KEY = 'YYYYYYYY'

#########################
#   Ignore this stuff   #
#  Skip to "your file"  #
#########################

def get_minio_client():
    """ create Minio Client. """
    # TODO: Accessing this from within the datacenter would be better.
    URL = f'https://{TENANT}.aaw.cloud.statcan.ca'  
    SECURE = URL.startswith('https')
    return boto3.client('s3',
                  endpoint_url=URL,
                  aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY,
                  use_ssl=SECURE,
                  region_name="us-west-1")

def __from_s3__(table_type):
    def get_from_json(r):
        """
        Read the response block by block as JSON,
        write to disk to keep memory from exploding.
        """
        temp = tempfile.NamedTemporaryFile(delete=False)
        for event in r['Payload']:
            if 'Records' in event:
                temp.write(event['Records']['Payload'])
        temp.close()
        resp = table_type(temp.name)
        try:
            remove(temp.name)
        except:
            print(f"""There was an error removing the file:\n{temp.name}\n... Proceeding anyway.""", file=sys.stderr)

        return resp
    return get_from_json

@__from_s3__
def pandas_from_json(r):
    """ Read the stream from s3 and turn JSON into a pandas Dataframe. """
    df = pd.read_json(r, lines=True)
    print("Created Dataframe with dimensions: (nrow, ncol) = %s" % str(df.shape), file=sys.stderr)
    return df

s3 = get_minio_client()

############################################################
############################################################
#                                           ~ edit below ~ #
#       YOUR FILE: Tweak the path,                         #
#                        the extension,                    #
#                        the query,                        #
#                    and the compression                   #
#                                                          #
############################################################
############################################################

BUCKET = 'shared'
r = s3.select_object_content(
    Bucket=BUCKET,
    Key='/blair-drummond/sql-example/TotalPopulation.csv.gz',
    ExpressionType='SQL',
    # Note, there's no ';' at the end.
    Expression="""
    SELECT PopTotal,PopDensity FROM s3object s
    WHERE s.Location like '%Canada%'
    """,
    InputSerialization={
        'CSV': {
            # Use this if your CSV file has a header. Else set to "NONE".
            "FileHeaderInfo": "USE",
            'RecordDelimiter': '\n',
            'FieldDelimiter': ',',
            ## There are more formatting options available, if they're needed.
        },
        # Remove this if the file isn't compressed.
        'CompressionType': 'GZIP',
    },
    # JSON is easier to work with than csv, unless you
    # have a massive amount of data.
    OutputSerialization={'JSON': {}},
    # OutputSerialization={'CSV': {'RecordDelimiter': '\n', 'FieldDelimiter': ','}},

)

# Using the CSV format is WAY faster, but you lose the column names!
# You can edit this.

df = pandas_from_json(r)
# df = pandas_from_csv(r)

print(df.head())