{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary dependencies and settings" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Transforming Nominal Features" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamePlatformYearGenrePublisher
1Super Mario Bros.NES1985.0PlatformNintendo
2Mario Kart WiiWii2008.0RacingNintendo
3Wii Sports ResortWii2009.0SportsNintendo
4Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo
5TetrisGB1989.0PuzzleNintendo
6New Super Mario Bros.DS2006.0PlatformNintendo
\n", "
" ], "text/plain": [ " Name Platform Year Genre Publisher\n", "1 Super Mario Bros. NES 1985.0 Platform Nintendo\n", "2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n", "3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n", "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n", "5 Tetris GB 1989.0 Puzzle Nintendo\n", "6 New Super Mario Bros. DS 2006.0 Platform Nintendo" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the video game sales data and preview the categorical columns\n", "vg_df = pd.read_csv('datasets/vgsales.csv', encoding='utf-8')\n", "vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n", " 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n", " 'Strategy'], dtype=object)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# distinct genre names, sorted by np.unique\n", "genres = np.unique(vg_df['Genre'])\n", "genres" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0: 'Action',\n", " 1: 'Adventure',\n", " 2: 'Fighting',\n", " 3: 'Misc',\n", " 4: 'Platform',\n", " 5: 'Puzzle',\n", " 6: 'Racing',\n", " 7: 'Role-Playing',\n", " 8: 'Shooter',\n", " 9: 'Simulation',\n", " 10: 'Sports',\n", " 11: 'Strategy'}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "# fit the encoder on the genre column and get one integer label per row\n", "gle = LabelEncoder()\n", "genre_labels = gle.fit_transform(vg_df['Genre'])\n", "\n", "# integer label -> genre name, following the order of gle.classes_\n", "genre_mappings = dict(enumerate(gle.classes_))\n", "genre_mappings" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NamePlatformYearGenreGenreLabel
1Super Mario Bros.NES1985.0Platform4
2Mario Kart WiiWii2008.0Racing6
3Wii Sports ResortWii2009.0Sports10
4Pokemon Red/Pokemon BlueGB1996.0Role-Playing7
5TetrisGB1989.0Puzzle5
6New Super Mario Bros.DS2006.0Platform4
\n", "
" ], "text/plain": [ " Name Platform Year Genre GenreLabel\n", "1 Super Mario Bros. NES 1985.0 Platform 4\n", "2 Mario Kart Wii Wii 2008.0 Racing 6\n", "3 Wii Sports Resort Wii 2009.0 Sports 10\n", "4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n", "5 Tetris GB 1989.0 Puzzle 5\n", "6 New Super Mario Bros. DS 2006.0 Platform 4" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# store the integer genre labels next to the original genre names\n", "vg_df['GenreLabel'] = genre_labels\n", "vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Transforming Ordinal Features" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the pokemon data, shuffle all rows with a fixed seed and renumber the index\n", "poke_df = (pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')\n", "             .sample(frac=1, random_state=1)\n", "             .reset_index(drop=True))\n", "\n", "np.unique(poke_df['Generation'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGenerationLabel
4OctilleryGen 22
5HelioptileGen 66
6DialgaGen 44
7DeoxysDefense FormeGen 33
8RapidashGen 11
9SwannaGen 55
\n", "
" ], "text/plain": [ " Name Generation GenerationLabel\n", "4 Octillery Gen 2 2\n", "5 Helioptile Gen 6 6\n", "6 Dialga Gen 4 4\n", "7 DeoxysDefense Forme Gen 3 3\n", "8 Rapidash Gen 1 1\n", "9 Swanna Gen 5 5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ordinal mapping 'Gen 1' -> 1 ... 'Gen 6' -> 6\n", "gen_ord_map = {'Gen {}'.format(rank): rank for rank in range(1, 7)}\n", "\n", "poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)\n", "poke_df.iloc[4:10][['Name', 'Generation', 'GenerationLabel']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding Categorical Features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## One-hot Encoding Scheme" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationLegendary
4OctilleryGen 2False
5HelioptileGen 6False
6DialgaGen 4True
7DeoxysDefense FormeGen 3True
8RapidashGen 1False
9SwannaGen 5False
\n", "
" ], "text/plain": [ " Name Generation Legendary\n", "4 Octillery Gen 2 False\n", "5 Helioptile Gen 6 False\n", "6 Dialga Gen 4 True\n", "7 DeoxysDefense Forme Gen 3 True\n", "8 Rapidash Gen 1 False\n", "9 Swanna Gen 5 False" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# preview the two categorical attributes that are encoded in the following cells\n", "poke_df.iloc[4:10][['Name', 'Generation', 'Legendary']]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen_LabelLegendaryLgnd_Label
4OctilleryGen 21False0
5HelioptileGen 65False0
6DialgaGen 43True1
7DeoxysDefense FormeGen 32True1
8RapidashGen 10False0
9SwannaGen 54False0
\n", "
" ], "text/plain": [ " Name Generation Gen_Label Legendary Lgnd_Label\n", "4 Octillery Gen 2 1 False 0\n", "5 Helioptile Gen 6 5 False 0\n", "6 Dialga Gen 4 3 True 1\n", "7 DeoxysDefense Forme Gen 3 2 True 1\n", "8 Rapidash Gen 1 0 False 0\n", "9 Swanna Gen 5 4 False 0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", "\n", "# integer-encode pokemon generations\n", "gen_le = LabelEncoder()\n", "gen_labels = gen_le.fit_transform(poke_df['Generation'])\n", "\n", "# integer-encode pokemon legendary status\n", "leg_le = LabelEncoder()\n", "leg_labels = leg_le.fit_transform(poke_df['Legendary'])\n", "\n", "# attach both label columns, then keep only the columns of interest\n", "poke_df['Gen_Label'] = gen_labels\n", "poke_df['Lgnd_Label'] = leg_labels\n", "poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]\n", "poke_df_sub.iloc[4:10]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# one-hot encode the integer generation labels; columns are named after the generations\n", "gen_ohe = OneHotEncoder()\n", "gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()\n", "gen_feature_labels = gen_le.classes_.tolist()\n", "gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)\n", "\n", "# one-hot encode the integer legendary labels; columns are Legendary_False / Legendary_True\n", "leg_ohe = OneHotEncoder()\n", "leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()\n", "leg_feature_labels = ['Legendary_{}'.format(cls_label) for cls_label in leg_le.classes_]\n", "leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen_LabelGen 1Gen 2Gen 3Gen 4Gen 5Gen 6LegendaryLgnd_LabelLegendary_FalseLegendary_True
4OctilleryGen 210.01.00.00.00.00.0False01.00.0
5HelioptileGen 650.00.00.00.00.01.0False01.00.0
6DialgaGen 430.00.00.01.00.00.0True10.01.0
7DeoxysDefense FormeGen 320.00.01.00.00.00.0True10.01.0
8RapidashGen 101.00.00.00.00.00.0False01.00.0
9SwannaGen 540.00.00.00.01.00.0False01.00.0
\n", "
" ], "text/plain": [ " Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 \\\n", "4 Octillery Gen 2 1 0.0 1.0 0.0 0.0 \n", "5 Helioptile Gen 6 5 0.0 0.0 0.0 0.0 \n", "6 Dialga Gen 4 3 0.0 0.0 0.0 1.0 \n", "7 DeoxysDefense Forme Gen 3 2 0.0 0.0 1.0 0.0 \n", "8 Rapidash Gen 1 0 1.0 0.0 0.0 0.0 \n", "9 Swanna Gen 5 4 0.0 0.0 0.0 0.0 \n", "\n", " Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True \n", "4 0.0 0.0 False 0 1.0 0.0 \n", "5 0.0 1.0 False 0 1.0 0.0 \n", "6 0.0 0.0 True 1 0.0 1.0 \n", "7 0.0 0.0 True 1 0.0 1.0 \n", "8 0.0 0.0 False 0 1.0 0.0 \n", "9 1.0 0.0 False 0 1.0 0.0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# put each group of one-hot columns right after the label column it was derived from\n", "poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)\n", "columns = (['Name', 'Generation', 'Gen_Label'] + gen_feature_labels +\n", "           ['Legendary', 'Lgnd_Label'] + leg_feature_labels)\n", "poke_df_ohe[columns].iloc[4:10]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationLegendary
0PikaZoomGen 3True
1CharMyToastGen 4False
\n", "
" ], "text/plain": [ " Name Generation Legendary\n", "0 PikaZoom Gen 3 True\n", "1 CharMyToast Gen 4 False" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# a small frame holding two new pokemon records\n", "new_poke_df = pd.DataFrame({'Name': ['PikaZoom', 'CharMyToast'],\n", "                            'Generation': ['Gen 3', 'Gen 4'],\n", "                            'Legendary': [True, False]},\n", "                           columns=['Name', 'Generation', 'Legendary'])\n", "new_poke_df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen_LabelLegendaryLgnd_Label
0PikaZoomGen 32True1
1CharMyToastGen 43False0
\n", "
" ], "text/plain": [ " Name Generation Gen_Label Legendary Lgnd_Label\n", "0 PikaZoom Gen 3 2 True 1\n", "1 CharMyToast Gen 4 3 False 0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# label the new rows with the existing encoders (transform only, no refit)\n", "new_gen_labels = gen_le.transform(new_poke_df['Generation'])\n", "new_leg_labels = leg_le.transform(new_poke_df['Legendary'])\n", "\n", "new_poke_df['Gen_Label'] = new_gen_labels\n", "new_poke_df['Lgnd_Label'] = new_leg_labels\n", "\n", "new_poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen_LabelGen 1Gen 2Gen 3Gen 4Gen 5Gen 6LegendaryLgnd_LabelLegendary_FalseLegendary_True
0PikaZoomGen 320.00.01.00.00.00.0True10.01.0
1CharMyToastGen 430.00.00.01.00.00.0False01.00.0
\n", "
" ], "text/plain": [ " Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 \\\n", "0 PikaZoom Gen 3 2 0.0 0.0 1.0 0.0 0.0 \n", "1 CharMyToast Gen 4 3 0.0 0.0 0.0 1.0 0.0 \n", "\n", " Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True \n", "0 0.0 True 1 0.0 1.0 \n", "1 0.0 False 0 1.0 0.0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one-hot encode the new label columns with the existing encoders (transform only)\n", "new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()\n", "new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()\n", "\n", "new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)\n", "new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)\n", "\n", "new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)\n", "columns = (['Name', 'Generation', 'Gen_Label'] + gen_feature_labels +\n", "           ['Legendary', 'Lgnd_Label'] + leg_feature_labels)\n", "new_poke_ohe[columns]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen 1Gen 2Gen 3Gen 4Gen 5Gen 6
4OctilleryGen 2010000
5HelioptileGen 6000001
6DialgaGen 4000100
7DeoxysDefense FormeGen 3001000
8RapidashGen 1100000
9SwannaGen 5000010
\n", "
" ], "text/plain": [ " Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n", "4 Octillery Gen 2 0 1 0 0 0 0\n", "5 Helioptile Gen 6 0 0 0 0 0 1\n", "6 Dialga Gen 4 0 0 0 1 0 0\n", "7 DeoxysDefense Forme Gen 3 0 0 1 0 0 0\n", "8 Rapidash Gen 1 1 0 0 0 0 0\n", "9 Swanna Gen 5 0 0 0 0 1 0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas shortcut: one indicator column per generation\n", "gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n", "poke_df[['Name', 'Generation']].join(gen_onehot_features).iloc[4:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dummy Coding Scheme" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen 2Gen 3Gen 4Gen 5Gen 6
4OctilleryGen 210000
5HelioptileGen 600001
6DialgaGen 400100
7DeoxysDefense FormeGen 301000
8RapidashGen 100000
9SwannaGen 500010
\n", "
" ], "text/plain": [ " Name Generation Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n", "4 Octillery Gen 2 1 0 0 0 0\n", "5 Helioptile Gen 6 0 0 0 0 1\n", "6 Dialga Gen 4 0 0 1 0 0\n", "7 DeoxysDefense Forme Gen 3 0 1 0 0 0\n", "8 Rapidash Gen 1 0 0 0 0 0\n", "9 Swanna Gen 5 0 0 0 1 0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dummy coding: drop the first level (Gen 1), which becomes the all-zero baseline\n", "gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)\n", "poke_df[['Name', 'Generation']].join(gen_dummy_features).iloc[4:10]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen 1Gen 2Gen 3Gen 4Gen 5
4OctilleryGen 201000
5HelioptileGen 600000
6DialgaGen 400010
7DeoxysDefense FormeGen 300100
8RapidashGen 110000
9SwannaGen 500001
\n", "
" ], "text/plain": [ " Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5\n", "4 Octillery Gen 2 0 1 0 0 0\n", "5 Helioptile Gen 6 0 0 0 0 0\n", "6 Dialga Gen 4 0 0 0 1 0\n", "7 DeoxysDefense Forme Gen 3 0 0 1 0 0\n", "8 Rapidash Gen 1 1 0 0 0 0\n", "9 Swanna Gen 5 0 0 0 0 1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dummy coding variant: keep every indicator column except the last one (Gen 6)\n", "gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n", "gen_dummy_features = gen_onehot_features[gen_onehot_features.columns[:-1]]\n", "poke_df[['Name', 'Generation']].join(gen_dummy_features).iloc[4:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Effect Coding Scheme" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Program Files\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:517: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " self.obj[item] = s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenerationGen 1Gen 2Gen 3Gen 4Gen 5
4OctilleryGen 20.01.00.00.00.0
5HelioptileGen 6-1.0-1.0-1.0-1.0-1.0
6DialgaGen 40.00.00.01.00.0
7DeoxysDefense FormeGen 30.00.01.00.00.0
8RapidashGen 11.00.00.00.00.0
9SwannaGen 50.00.00.00.01.0
\n", "
" ], "text/plain": [ " Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5\n", "4 Octillery Gen 2 0.0 1.0 0.0 0.0 0.0\n", "5 Helioptile Gen 6 -1.0 -1.0 -1.0 -1.0 -1.0\n", "6 Dialga Gen 4 0.0 0.0 0.0 1.0 0.0\n", "7 DeoxysDefense Forme Gen 3 0.0 0.0 1.0 0.0 0.0\n", "8 Rapidash Gen 1 1.0 0.0 0.0 0.0 0.0\n", "9 Swanna Gen 5 0.0 0.0 0.0 0.0 1.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n", "# astype returns a new float frame, so the row assignment below writes to an independent\n", "# object instead of a slice of gen_onehot_features (which raised SettingWithCopyWarning)\n", "gen_effect_features = gen_onehot_features.iloc[:, :-1].astype(float)\n", "# effect coding: rows of the dropped level (all remaining indicators are 0) get -1 everywhere\n", "gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.\n", "pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Hashing scheme" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total game genres: 12\n", "['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'\n", " 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']\n" ] } ], "source": [ "unique_genres = np.unique(vg_df[['Genre']])\n", "print(\"Total game genres:\", len(unique_genres))\n", "print(unique_genres)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameGenre012345
1Super Mario Bros.Platform0.02.02.0-1.01.00.0
2Mario Kart WiiRacing-1.00.00.00.00.0-1.0
3Wii Sports ResortSports-2.02.00.0-2.00.00.0
4Pokemon Red/Pokemon BlueRole-Playing-1.01.02.00.01.0-1.0
5TetrisPuzzle0.01.01.0-2.01.0-1.0
6New Super Mario Bros.Platform0.02.02.0-1.01.00.0
\n", "
" ], "text/plain": [ " Name Genre 0 1 2 3 4 5\n", "1 Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0\n", "2 Mario Kart Wii Racing -1.0 0.0 0.0 0.0 0.0 -1.0\n", "3 Wii Sports Resort Sports -2.0 2.0 0.0 -2.0 0.0 0.0\n", "4 Pokemon Red/Pokemon Blue Role-Playing -1.0 1.0 2.0 0.0 1.0 -1.0\n", "5 Tetris Puzzle 0.0 1.0 1.0 -2.0 1.0 -1.0\n", "6 New Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction import FeatureHasher\n", "\n", "fh = FeatureHasher(n_features=6, input_type='string')\n", "# input_type='string' expects every sample to be an iterable of string tokens.\n", "# A bare str sample is consumed character by character, and newer scikit-learn\n", "# releases reject bare strings, so spell the per-character tokens out explicitly.\n", "# NOTE(review): to hash whole genre names instead, pass [[genre] for genre in vg_df['Genre']];\n", "# that changes the hashed values shown below.\n", "genre_tokens = [list(genre) for genre in vg_df['Genre']]\n", "hashed_features = fh.fit_transform(genre_tokens).toarray()\n", "pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'dtype': numpy.float64,\n", " 'input_type': 'string',\n", " 'n_features': 6,\n", " 'non_negative': False}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh.get_params()" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }