{ "cells": [ { "cell_type": "markdown", "id": "6c4b1153-be69-4550-a9f7-c9d28a88032b", "metadata": {}, "source": [ "# Load and connect data" ] }, { "cell_type": "code", "execution_count": 4, "id": "9edb371d-8c6c-44c3-a31b-615ece69d65c", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting mysql-connector-python\n", " Downloading mysql_connector_python-8.0.29-py2.py3-none-any.whl (342 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m342.0/342.0 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting protobuf>=3.0.0\n", " Downloading protobuf-4.21.1-cp37-abi3-manylinux2014_aarch64.whl (403 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", "\u001b[?25hInstalling collected packages: protobuf, mysql-connector-python\n", "Successfully installed mysql-connector-python-8.0.29 protobuf-4.21.1\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "%pip install mysql-connector-python" ] }, { "cell_type": "code", "execution_count": 70, "id": "63d0c243-0ee6-4266-ab6e-9f273b3a5dd7", "metadata": { "tags": [] }, "outputs": [], "source": [ "import mysql.connector\n", "import os\n", "import pandas as pd\n", "\n", "\n", "def query(sql):\n", " mydb = mysql.connector.connect(\n", " host='mage-development.cxj4djmtpwkx.us-west-2.rds.amazonaws.com',\n", " user='root',\n", " password=os.getenv('DB_PASSWORD'),\n", " database='materia_development',\n", " )\n", " mycursor = mydb.cursor()\n", "\n", " mycursor.execute(sql)\n", "\n", " return mycursor.fetchall()\n", "\n", "\n", "def query_table(table_name):\n", " columns = [r[0] for r in query(f'DESCRIBE {table_name}')]\n", " rows = query(f'SELECT * FROM {table_name}')\n", " \n", " return pd.DataFrame(rows, columns=columns)" ] }, { "cell_type": "code", "execution_count": 71, "id": "ce511f1c-5d1f-47ad-805a-a18f16b80aa5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'auth_group, auth_group_permissions, auth_permission, auth_user, auth_user_groups, auth_user_user_permissions, authtoken_token, corsheaders_corsmodel, data_cleaning_pipeline, data_management_dataconnector, data_management_datadestination, data_management_datasource, data_management_entity, data_management_feature, data_management_featureset, data_management_featuresetsource, data_management_featuresetsourceoperation, data_management_featuresetversion, data_management_featuresetversionjoin, data_management_transformeraction, django_admin_log, django_content_type, django_migrations, django_session, django_site, experimentation_experiment, inference_predictionoutcome, inference_predictionresult, lifecycle_management_trainingfeature, lifecycle_management_trainingfeatureset, lifecycle_management_trainingset, model_with_versions, model_with_versions_from_staff, oauth2_provider_accesstoken, oauth2_provider_application, oauth2_provider_grant, oauth2_provider_refreshtoken, prototyping_custompredictionvalue, prototyping_model, prototyping_modelcollection, prototyping_modelversion, prototyping_trainingrun, suggestion_management_suggestion, suggestion_suggestion, users_group, users_groupapplication, users_groupmembership, users_invitation, users_userprofile'" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "', '.join([r[0] for r in query('SHOW TABLES')])" ] }, { "cell_type": "code", "execution_count": 78, "id": "c0073435-1179-4bb3-92e0-9d04937963dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Cleaning suggestions:\n", "1. Remove columns with no values(['partition_filter_version']): Remove columns with no values to increase data quality.\n", "2. Remove columns with high empty rate(['deleted_at_feature', 'window_feature', 'transformer_action_id_feature', 'deleted_at_version', 'parent_feature_set_version_id_version', 'reference_feature_set_version_id_version', 'insights_analyzed_at_version', 'deleted_at_feature_set', 'description_feature_set', 'source_type_feature_set']): Remove columns with many missing values may increase data quality.\n", "3. Remove columns with single value(['structure_type_version']): Remove columns with a single unique value to reduce the amount of redundant data.\n", "4. Fix syntax errors(['username_user']): Fix syntactical errors to reduce the amount of noise in the data.\n", "5. Reformat values(['username_user', 'email_user', 'uuid_feature', 'name_feature', 'description_feature', 'name_feature_set']): Format entries in these columns as fully lowercase to improve data quality.\n", "6. Fill in missing values(['deleted_at_feature', 'window_feature', 'transformed_feature_id_feature', 'transformer_action_id_feature', 'source_feature_id_feature', 'deleted_at_version', 'entity_feature_id_version', 'timestamp_feature_id_version', 'statistics_calculated_at_version', 'parent_feature_set_version_id_version', 'reference_feature_set_version_id_version', 'model_version_id_version', 'partition_filter_version', 'insights_analyzed_at_version', 'deleted_at_feature_set', 'description_feature_set', 'model_id_feature_set', 'source_type_feature_set']): Fill missing values with a placeholder to mark them as missing.\n", "7. Fill in missing values(['user_id_feature', 'user_id_version']): For each column, fill missing entries with the median value.\n", "8. Fill in missing values(['description_feature', 'operation_feature']): Fill missing entries using the previously occurring entry in the timeseries.\n", "9. Remove outliers(['id_user']): Remove 633 outlier(s) to reduce the amount of noise in this column.\n", "10. Remove outliers(['user_id_feature']): Remove 341 outlier(s) to reduce the amount of noise in this column.\n", "11. Remove outliers(['transformer_action_id_feature']): Remove 6 outlier(s) to reduce the amount of noise in this column.\n", "12. Remove outliers(['source_feature_id_feature']): Remove 28 outlier(s) to reduce the amount of noise in this column.\n", "13. Remove outliers(['entity_feature_id_version']): Remove 428 outlier(s) to reduce the amount of noise in this column.\n", "14. Remove outliers(['timestamp_feature_id_version']): Remove 313 outlier(s) to reduce the amount of noise in this column.\n", "15. Remove outliers(['user_id_version']): Remove 633 outlier(s) to reduce the amount of noise in this column.\n", "16. Remove outliers(['user_id_feature_set']): Remove 633 outlier(s) to reduce the amount of noise in this column." ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import mage_ai\n", "\n", "# df = query_table('materia_development.model_with_versions')\n", "# mage_ai.connect_data(df, 'model_with_versions')\n", "\n", "df = query_table('materia_development.features_with_feature_sets')\n", "mage_ai.connect_data(df, 'features_with_feature_sets')" ] }, { "cell_type": "markdown", "id": "189a7c13-3fc8-4958-95ba-cb80e3ff70d9", "metadata": {}, "source": [ "# Clean data using pipeline created from app" ] }, { "cell_type": "code", "execution_count": null, "id": "39b50ed0-39c5-4f11-a429-8f3be5ca0940", "metadata": {}, "outputs": [], "source": [ "mage_ai.clean(df, pipeline_uuid=1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.5" } }, "nbformat": 4, "nbformat_minor": 5 }