{"cells":[{"cell_type":"code","source":"!pip install funpymodeling","metadata":{"id":"1cOuqhGeEyGx","colab":{"height":567,"base_uri":"https://localhost:8080/"},"cell_id":"8dabf94ce1f245fcad7e9cd711978767","outputId":"8be53251-6d83-40ab-c807-3a1a5e53dcad","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":4145,"user_tz":300,"timestamp":1633045152171},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stdout","text":"Collecting funpymodeling\n Downloading funpymodeling-0.1.7-py3-none-any.whl (6.4 kB)\nRequirement already satisfied: typing-extensions<4.0.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from funpymodeling) (3.7.4.3)\nRequirement already satisfied: numpy<2.0.0,>=1.18.5 in /usr/local/lib/python3.7/dist-packages (from funpymodeling) (1.19.5)\nRequirement already satisfied: matplotlib<4.0.0,>=3.2.2 in /usr/local/lib/python3.7/dist-packages (from funpymodeling) (3.2.2)\nRequirement already satisfied: pandas<2.0.0,>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from funpymodeling) (1.1.5)\nCollecting seaborn<0.11.0,>=0.10.1\n Downloading seaborn-0.10.1-py3-none-any.whl (215 kB)\n\u001b[K |████████████████████████████████| 215 kB 7.0 MB/s \n\u001b[?25hRequirement already satisfied: sklearn<0.1,>=0.0 in /usr/local/lib/python3.7/dist-packages (from funpymodeling) (0.0)\nRequirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib<4.0.0,>=3.2.2->funpymodeling) (2.4.7)\nRequirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib<4.0.0,>=3.2.2->funpymodeling) (1.3.2)\nRequirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib<4.0.0,>=3.2.2->funpymodeling) (0.10.0)\nRequirement already satisfied: 
python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib<4.0.0,>=3.2.2->funpymodeling) (2.8.2)\nRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from cycler>=0.10->matplotlib<4.0.0,>=3.2.2->funpymodeling) (1.15.0)\nRequirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0.0,>=1.0.5->funpymodeling) (2018.9)\nRequirement already satisfied: scipy>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from seaborn<0.11.0,>=0.10.1->funpymodeling) (1.4.1)\nRequirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn<0.1,>=0.0->funpymodeling) (0.22.2.post1)\nRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn<0.1,>=0.0->funpymodeling) (1.0.1)\nInstalling collected packages: seaborn, funpymodeling\n Attempting uninstall: seaborn\n Found existing installation: seaborn 0.11.2\n Uninstalling seaborn-0.11.2:\n Successfully uninstalled seaborn-0.11.2\nSuccessfully installed funpymodeling-0.1.7 seaborn-0.10.1\n"},{"output_type":"display_data","data":{"application/vnd.colab-display-data+json":{"pip_warning":{"packages":["seaborn"]}}},"metadata":{}}],"execution_count":2},{"cell_type":"code","source":"#Importacion de las librerias\nimport pandas as pd\nimport seaborn as sns\nfrom pandas_profiling import ProfileReport\nfrom funpymodeling.exploratory import cat_vars, num_vars\nimport numpy as np","metadata":{"id":"9NEJ9_A5EmTs","cell_id":"699858af16de42499429c9dd23883304","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":259,"user_tz":300,"timestamp":1633045152175},"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":"### Preparacion de 
Datos","metadata":{"id":"qhcDFRmDEmT3","cell_id":"bb8f59f1524c4b5b82bd39f7431c6529","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"# Carga de datos\ndata=pd.read_csv(\"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv\")\n\n# Removemos duplicados de canciones:\ndata=data.drop_duplicates(subset=\"track_id\")\n\n#Nos quedamos unicamente con las variables numericas\nx_data=data.drop(cat_vars(data), axis=1)\n\n# Sacamos algunas variables adicionales que no aportan valor\nx_data=x_data.drop(['key','speechiness', 'mode', 'tempo', 'duration_ms'], axis=1)","metadata":{"id":"UB1IwLk5EmUD","cell_id":"5063634316bc4d47893a879776e7224b","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":1086,"user_tz":300,"timestamp":1633045159411},"deepnote_cell_type":"code"},"outputs":[],"execution_count":4},{"cell_type":"markdown","source":"### Creación del modelo de PCA","metadata":{"id":"sWA3UDxhEmUH","cell_id":"7cfc54fb64a3410680eb62305420e691","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"#Importamos la libreria para el escalado de los datos\nfrom sklearn.preprocessing import StandardScaler\n\n#Generamos el objeto\nscaler = StandardScaler()\n\n#Aplicamos la transformacion\nx_scaled = scaler.fit_transform(x_data)\n\n#Importante: Los datos no tienen que tener nulos y deben ser todos numericos","metadata":{"id":"iaUyuK0VEmUK","cell_id":"7e393c4704cd4df9a4967ba611fb7781","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos 
usta"},"status":"ok","elapsed":293,"user_tz":300,"timestamp":1633045162772},"deepnote_cell_type":"code"},"outputs":[],"execution_count":5},{"cell_type":"markdown","source":"Generamos el modelo y lo ajustamos:","metadata":{"id":"jI-tKWpmEmUM","cell_id":"45315a9581f04b5e9989236790ddb758","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"#Importamos la libreria\nfrom sklearn.decomposition import PCA\n\n#Generamos el objeto\nmodel_pca = PCA()\n\n#Aplicamos pca\nx_pca=model_pca.fit_transform(x_scaled)","metadata":{"id":"uJuCoHpREmUO","cell_id":"71e21c663aab4577a2e47b89347dd126","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":291,"user_tz":300,"timestamp":1633045165650},"deepnote_cell_type":"code"},"outputs":[],"execution_count":6},{"cell_type":"code","source":"#Varianza explicada de las componentes\nvar_explicada_pca = model_pca.explained_variance_ratio_\nvar_explicada_pca","metadata":{"id":"1VZTpgdREmUT","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"fde184aa99ba40698baf340b13a00fcd","outputId":"c82a909f-f2c8-4401-af85-fdf5bc2cfb04","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":10,"user_tz":300,"timestamp":1633045167410},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"array([0.26718277, 0.17844569, 0.14004096, 0.12199857, 0.10753268,\n 0.08223481, 0.0748838 , 0.02768073])"},"metadata":{},"execution_count":7}],"execution_count":7},{"cell_type":"markdown","source":"**Interpretación**:\n\nLa primera componente aporta aproximadamente el 26,7 % de la varianza explicada, la segunda el 17,8 % y así 
sucesivamente","metadata":{"id":"G0qWr9t0EmUd","cell_id":"3e9dfad5dd7944abbcca1846bdd9148d","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"# UMAP","metadata":{"id":"j1iKfljSEmUf","cell_id":"91d16279e2f94f6583f4050345232c63","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"UMAP es un método de reducción de dimensionalidad no lineal y es muy eficaz para visualizar agrupaciones de puntos de datos y sus proximidades relativas.\n\nEnlace de interés:\n\n* https://towardsdatascience.com/dimensionality-reduction-for-data-visualization-pca-vs-tsne-vs-umap-be4aa7b1cb29","metadata":{"id":"ntcjLvhfEmUh","cell_id":"4609a856b99c44e085713a04f5b265db","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"!pip3 install umap-learn","metadata":{"id":"E_d8Rz7dE-JK","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"163578c54a2a4906a4d221f2bdfde180","outputId":"a468ad2a-0819-498d-ed21-a7ce2ac0b058","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":5952,"user_tz":300,"timestamp":1633045191196},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stdout","text":"Collecting umap-learn\n Downloading umap-learn-0.5.1.tar.gz (80 kB)\n\u001b[K |████████████████████████████████| 80 kB 4.7 MB/s \n\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (1.19.5)\nRequirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (0.22.2.post1)\nRequirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (1.4.1)\nRequirement already satisfied: numba>=0.49 in /usr/local/lib/python3.7/dist-packages (from umap-learn) (0.51.2)\nCollecting pynndescent>=0.5\n Downloading pynndescent-0.5.4.tar.gz (1.1 
MB)\n\u001b[K |████████████████████████████████| 1.1 MB 16.4 MB/s \n\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from numba>=0.49->umap-learn) (57.4.0)\nRequirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba>=0.49->umap-learn) (0.34.0)\nRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pynndescent>=0.5->umap-learn) (1.0.1)\nBuilding wheels for collected packages: umap-learn, pynndescent\n Building wheel for umap-learn (setup.py) ... \u001b[?25l\u001b[?25hdone\n Created wheel for umap-learn: filename=umap_learn-0.5.1-py3-none-any.whl size=76564 sha256=98e506631763535c0847e3680bf9717b96a571088a8935f85ab6cacbf15b61f6\n Stored in directory: /root/.cache/pip/wheels/01/e7/bb/347dc0e510803d7116a13d592b10cc68262da56a8eec4dd72f\n Building wheel for pynndescent (setup.py) ... \u001b[?25l\u001b[?25hdone\n Created wheel for pynndescent: filename=pynndescent-0.5.4-py3-none-any.whl size=52373 sha256=16714ac70828972fe718f24e299c55e5c1bacdcb34c69d29133f455013f5751b\n Stored in directory: /root/.cache/pip/wheels/d0/5b/62/3401692ddad12324249c774c4b15ccb046946021e2b581c043\nSuccessfully built umap-learn pynndescent\nInstalling collected packages: pynndescent, umap-learn\nSuccessfully installed pynndescent-0.5.4 umap-learn-0.5.1\n"}],"execution_count":9},{"cell_type":"code","source":"import umap #pip3 install umap-learn","metadata":{"id":"Ngeu7T7tEmUk","cell_id":"73dcc806151646a09f147b0977656e61","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":20025,"user_tz":300,"timestamp":1633045213660},"deepnote_cell_type":"code"},"outputs":[],"execution_count":10},{"cell_type":"code","source":"#Generamos el objeto para la estandarizacion\nx_scaled = 
StandardScaler()\n\n#Aplicamos la estandarizacion\nx_scaled = x_scaled.fit_transform(x_data)","metadata":{"id":"438Ju4zgEmUm","cell_id":"c624d45900db4777a7848d1dd3913d71","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":377,"user_tz":300,"timestamp":1633045217604},"deepnote_cell_type":"code"},"outputs":[],"execution_count":11},{"cell_type":"code","source":"#Obtenemos el objeto umap\nmodel_umap = umap.UMAP()","metadata":{"id":"5RFCHB5iEmUo","cell_id":"1de986ef7d98441fa714b2ef44e56c0f","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":369,"user_tz":300,"timestamp":1633045220326},"deepnote_cell_type":"code"},"outputs":[],"execution_count":12},{"cell_type":"code","source":"#Ejecutamos el umap\nmodel_umap_fit_transform = model_umap.fit_transform(x_scaled)\nmodel_umap_fit_transform.shape","metadata":{"id":"YHI2_KThEmUr","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"630e27564f164eaa91646843a27b8054","outputId":"731fe988-f609-4f6d-c0d4-5179f51b5948","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":48075,"user_tz":300,"timestamp":1633045270449},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stderr","text":"/usr/local/lib/python3.7/dist-packages/numba/np/ufunc/parallel.py:363: NumbaWarning: The TBB threading layer requires TBB version 2019.5 or later i.e., TBB_INTERFACE_VERSION >= 11005. Found TBB_INTERFACE_VERSION = 9107. 
The TBB threading layer is disabled.\n warnings.warn(problem)\n"},{"output_type":"execute_result","data":{"text/plain":"(28356, 2)"},"metadata":{},"execution_count":13}],"execution_count":13},{"cell_type":"markdown","source":"Visualización interactiva con Plotly!","metadata":{"id":"oznL7lqZEmUt","cell_id":"19bdc8743e9f4743b55443fdaee28b53","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"data2 = data.copy() #Hacemos un copy\ndata2['cancion'] = data2['track_artist'] + ' | ' + data2['track_name'] #Creamos una nueva variable llamada: cancion\ndata2[['dim1', 'dim2']] = model_umap_fit_transform #Agregamos las dos dimensiones generadas por umap","metadata":{"id":"JGNxa8SCEmUu","cell_id":"d985ecaffe5f45f5acfe3510e650ad91","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":403,"user_tz":300,"timestamp":1633045275212},"deepnote_cell_type":"code"},"outputs":[],"execution_count":14},{"cell_type":"code","source":"import plotly.express as px\n\nfig = px.scatter(data2, x=\"dim1\", y=\"dim2\", color=\"track_popularity\", hover_data=['cancion'])\nfig.show()","metadata":{"id":"tv-I0CBtEmUw","colab":{"height":542,"base_uri":"https://localhost:8080/"},"cell_id":"2e91e0466f83403da36e8ae5e6e302be","outputId":"13c60cff-6b28-4e29-e564-86de8cb8ed7e","executionInfo":{"user":{"userId":"04741209928239412574","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gi4e7mWJaOA2l-1KUn-omyigRGSrm83lG6XLzS5=s64","displayName":"david francisco bustos usta"},"status":"ok","elapsed":3994,"user_tz":300,"timestamp":1633045281949},"deepnote_cell_type":"code"},"outputs":[{"output_type":"display_data","data":{"text/html":"\n\n\n
\n \n \n \n
\n \n
\n\n"},"metadata":{}}],"execution_count":15},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"PCA y UMAP - CoderHouse (Ejemplo 4).ipynb","provenance":[],"collapsed_sections":[]},"deepnote":{},"kernelspec":{"name":"python3","language":"python","display_name":"Python 3"},"varInspector":{"cols":{"lenVar":40,"lenName":16,"lenType":16},"kernels_config":{"r":{"library":"var_list.r","varRefreshCmd":"cat(var_dic_list()) ","delete_cmd_prefix":"rm(","delete_cmd_postfix":") "},"python":{"library":"var_list.py","varRefreshCmd":"print(var_dic_list())","delete_cmd_prefix":"del ","delete_cmd_postfix":""}},"window_display":false,"types_to_exclude":["module","function","builtin_function_or_method","instance","_Feature"]},"language_info":{"name":"python","version":"3.8.5","mimetype":"text/x-python","file_extension":".py","pygments_lexer":"ipython3","codemirror_mode":{"name":"ipython","version":3},"nbconvert_exporter":"python"},"deepnote_notebook_id":"0c8239359eea4622a7df1b17b64a191f","deepnote_execution_queue":[]}}