{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6c4b1153-be69-4550-a9f7-c9d28a88032b",
   "metadata": {},
   "source": [
    "# Load and connect data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9edb371d-8c6c-44c3-a31b-615ece69d65c",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting mysql-connector-python\n",
      "  Downloading mysql_connector_python-8.0.29-py2.py3-none-any.whl (342 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m342.0/342.0 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting protobuf>=3.0.0\n",
      "  Downloading protobuf-4.21.1-cp37-abi3-manylinux2014_aarch64.whl (403 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: protobuf, mysql-connector-python\n",
      "Successfully installed mysql-connector-python-8.0.29 protobuf-4.21.1\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install mysql-connector-python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "63d0c243-0ee6-4266-ab6e-9f273b3a5dd7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import mysql.connector\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def query(sql):\n",
    "    mydb = mysql.connector.connect(\n",
    "      host='mage-development.cxj4djmtpwkx.us-west-2.rds.amazonaws.com',\n",
    "      user='root',\n",
    "      password=os.getenv('DB_PASSWORD'),\n",
    "      database='materia_development',\n",
    "    )\n",
    "    mycursor = mydb.cursor()\n",
    "\n",
    "    mycursor.execute(sql)\n",
    "\n",
    "    return mycursor.fetchall()\n",
    "\n",
    "\n",
    "def query_table(table_name):\n",
    "    columns = [r[0] for r in query(f'DESCRIBE {table_name}')]\n",
    "    rows = query(f'SELECT * FROM {table_name}')\n",
    "    \n",
    "    return pd.DataFrame(rows, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "ce511f1c-5d1f-47ad-805a-a18f16b80aa5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'auth_group, auth_group_permissions, auth_permission, auth_user, auth_user_groups, auth_user_user_permissions, authtoken_token, corsheaders_corsmodel, data_cleaning_pipeline, data_management_dataconnector, data_management_datadestination, data_management_datasource, data_management_entity, data_management_feature, data_management_featureset, data_management_featuresetsource, data_management_featuresetsourceoperation, data_management_featuresetversion, data_management_featuresetversionjoin, data_management_transformeraction, django_admin_log, django_content_type, django_migrations, django_session, django_site, experimentation_experiment, inference_predictionoutcome, inference_predictionresult, lifecycle_management_trainingfeature, lifecycle_management_trainingfeatureset, lifecycle_management_trainingset, model_with_versions, model_with_versions_from_staff, oauth2_provider_accesstoken, oauth2_provider_application, oauth2_provider_grant, oauth2_provider_refreshtoken, prototyping_custompredictionvalue, prototyping_model, prototyping_modelcollection, prototyping_modelversion, prototyping_trainingrun, suggestion_management_suggestion, suggestion_suggestion, users_group, users_groupapplication, users_groupmembership, users_invitation, users_userprofile'"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "', '.join([r[0] for r in query('SHOW TABLES')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "c0073435-1179-4bb3-92e0-9d04937963dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<FeatureSet 35> Cleaning suggestions:\n",
       "1. Remove columns with no values(['partition_filter_version']): Remove columns with no values to increase data quality.\n",
       "2. Remove columns with high empty rate(['deleted_at_feature', 'window_feature', 'transformer_action_id_feature', 'deleted_at_version', 'parent_feature_set_version_id_version', 'reference_feature_set_version_id_version', 'insights_analyzed_at_version', 'deleted_at_feature_set', 'description_feature_set', 'source_type_feature_set']): Remove columns with many missing values may increase data quality.\n",
       "3. Remove columns with single value(['structure_type_version']): Remove columns with a single unique value to reduce the amount of redundant data.\n",
       "4. Fix syntax errors(['username_user']): Fix syntactical errors to reduce the amount of noise in the data.\n",
       "5. Reformat values(['username_user', 'email_user', 'uuid_feature', 'name_feature', 'description_feature', 'name_feature_set']): Format entries in these columns as fully lowercase to improve data quality.\n",
       "6. Fill in missing values(['deleted_at_feature', 'window_feature', 'transformed_feature_id_feature', 'transformer_action_id_feature', 'source_feature_id_feature', 'deleted_at_version', 'entity_feature_id_version', 'timestamp_feature_id_version', 'statistics_calculated_at_version', 'parent_feature_set_version_id_version', 'reference_feature_set_version_id_version', 'model_version_id_version', 'partition_filter_version', 'insights_analyzed_at_version', 'deleted_at_feature_set', 'description_feature_set', 'model_id_feature_set', 'source_type_feature_set']): Fill missing values with a placeholder to mark them as missing.\n",
       "7. Fill in missing values(['user_id_feature', 'user_id_version']): For each column, fill missing entries with the median value.\n",
       "8. Fill in missing values(['description_feature', 'operation_feature']): Fill missing entries using the previously occurring entry in the timeseries.\n",
       "9. Remove outliers(['id_user']): Remove 633 outlier(s) to reduce the amount of noise in this column.\n",
       "10. Remove outliers(['user_id_feature']): Remove 341 outlier(s) to reduce the amount of noise in this column.\n",
       "11. Remove outliers(['transformer_action_id_feature']): Remove 6 outlier(s) to reduce the amount of noise in this column.\n",
       "12. Remove outliers(['source_feature_id_feature']): Remove 28 outlier(s) to reduce the amount of noise in this column.\n",
       "13. Remove outliers(['entity_feature_id_version']): Remove 428 outlier(s) to reduce the amount of noise in this column.\n",
       "14. Remove outliers(['timestamp_feature_id_version']): Remove 313 outlier(s) to reduce the amount of noise in this column.\n",
       "15. Remove outliers(['user_id_version']): Remove 633 outlier(s) to reduce the amount of noise in this column.\n",
       "16. Remove outliers(['user_id_feature_set']): Remove 633 outlier(s) to reduce the amount of noise in this column."
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mage_ai\n",
    "\n",
    "# df = query_table('materia_development.model_with_versions')\n",
    "# mage_ai.connect_data(df, 'model_with_versions')\n",
    "\n",
    "df = query_table('materia_development.features_with_feature_sets')\n",
    "mage_ai.connect_data(df, 'features_with_feature_sets')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "189a7c13-3fc8-4958-95ba-cb80e3ff70d9",
   "metadata": {},
   "source": [
    "# Clean data using pipeline created from app"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b50ed0-39c5-4f11-a429-8f3be5ca0940",
   "metadata": {},
   "outputs": [],
   "source": [
    "mage_ai.clean(df, pipeline_uuid=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}