{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import necessary dependencies and settings"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Transforming Nominal Features"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Platform | \n",
" Year | \n",
" Genre | \n",
" Publisher | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Super Mario Bros. | \n",
" NES | \n",
" 1985.0 | \n",
" Platform | \n",
" Nintendo | \n",
"
\n",
" \n",
" 2 | \n",
" Mario Kart Wii | \n",
" Wii | \n",
" 2008.0 | \n",
" Racing | \n",
" Nintendo | \n",
"
\n",
" \n",
" 3 | \n",
" Wii Sports Resort | \n",
" Wii | \n",
" 2009.0 | \n",
" Sports | \n",
" Nintendo | \n",
"
\n",
" \n",
" 4 | \n",
" Pokemon Red/Pokemon Blue | \n",
" GB | \n",
" 1996.0 | \n",
" Role-Playing | \n",
" Nintendo | \n",
"
\n",
" \n",
" 5 | \n",
" Tetris | \n",
" GB | \n",
" 1989.0 | \n",
" Puzzle | \n",
" Nintendo | \n",
"
\n",
" \n",
" 6 | \n",
" New Super Mario Bros. | \n",
" DS | \n",
" 2006.0 | \n",
" Platform | \n",
" Nintendo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Platform Year Genre Publisher\n",
"1 Super Mario Bros. NES 1985.0 Platform Nintendo\n",
"2 Mario Kart Wii Wii 2008.0 Racing Nintendo\n",
"3 Wii Sports Resort Wii 2009.0 Sports Nintendo\n",
"4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo\n",
"5 Tetris GB 1989.0 Puzzle Nintendo\n",
"6 New Super Mario Bros. DS 2006.0 Platform Nintendo"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the video game sales data and peek at a few descriptive columns.\n",
"vg_df = pd.read_csv('datasets/vgsales.csv', encoding='utf-8')\n",
"preview_cols = ['Name', 'Platform', 'Year', 'Genre', 'Publisher']\n",
"vg_df.iloc[1:7][preview_cols]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',\n",
" 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',\n",
" 'Strategy'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct genre names; np.unique returns them sorted.\n",
"genres = np.unique(vg_df.Genre)\n",
"genres"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'Action',\n",
" 1: 'Adventure',\n",
" 2: 'Fighting',\n",
" 3: 'Misc',\n",
" 4: 'Platform',\n",
" 5: 'Puzzle',\n",
" 6: 'Racing',\n",
" 7: 'Role-Playing',\n",
" 8: 'Shooter',\n",
" 9: 'Simulation',\n",
" 10: 'Sports',\n",
" 11: 'Strategy'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# Fit the encoder on the genre column, then turn every genre into its integer code.\n",
"gle = LabelEncoder().fit(vg_df['Genre'])\n",
"genre_labels = gle.transform(vg_df['Genre'])\n",
"\n",
"# classes_ lists the genre names in label order, so position == integer label.\n",
"genre_mappings = dict(enumerate(gle.classes_))\n",
"genre_mappings"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Platform | \n",
" Year | \n",
" Genre | \n",
" GenreLabel | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Super Mario Bros. | \n",
" NES | \n",
" 1985.0 | \n",
" Platform | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" Mario Kart Wii | \n",
" Wii | \n",
" 2008.0 | \n",
" Racing | \n",
" 6 | \n",
"
\n",
" \n",
" 3 | \n",
" Wii Sports Resort | \n",
" Wii | \n",
" 2009.0 | \n",
" Sports | \n",
" 10 | \n",
"
\n",
" \n",
" 4 | \n",
" Pokemon Red/Pokemon Blue | \n",
" GB | \n",
" 1996.0 | \n",
" Role-Playing | \n",
" 7 | \n",
"
\n",
" \n",
" 5 | \n",
" Tetris | \n",
" GB | \n",
" 1989.0 | \n",
" Puzzle | \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
" New Super Mario Bros. | \n",
" DS | \n",
" 2006.0 | \n",
" Platform | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Platform Year Genre GenreLabel\n",
"1 Super Mario Bros. NES 1985.0 Platform 4\n",
"2 Mario Kart Wii Wii 2008.0 Racing 6\n",
"3 Wii Sports Resort Wii 2009.0 Sports 10\n",
"4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7\n",
"5 Tetris GB 1989.0 Puzzle 5\n",
"6 New Super Mario Bros. DS 2006.0 Platform 4"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the integer genre codes next to the original genre names.\n",
"vg_df = vg_df.assign(GenreLabel=genre_labels)\n",
"vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Transforming Ordinal Features"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the Pokemon data, shuffle all rows with a fixed seed, and renumber the index.\n",
"poke_df = (\n",
"    pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')\n",
"    .sample(frac=1, random_state=1)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"np.unique(poke_df.Generation)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" GenerationLabel | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 6 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 4 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation GenerationLabel\n",
"4 Octillery Gen 2 2\n",
"5 Helioptile Gen 6 6\n",
"6 Dialga Gen 4 4\n",
"7 DeoxysDefense Forme Gen 3 3\n",
"8 Rapidash Gen 1 1\n",
"9 Swanna Gen 5 5"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ordinal encoding: each generation name maps to its rank, preserving the order.\n",
"gen_ord_map = {\n",
"    'Gen 1': 1,\n",
"    'Gen 2': 2,\n",
"    'Gen 3': 3,\n",
"    'Gen 4': 4,\n",
"    'Gen 5': 5,\n",
"    'Gen 6': 6,\n",
"}\n",
"\n",
"generation_rank = poke_df['Generation'].map(gen_ord_map)\n",
"poke_df['GenerationLabel'] = generation_rank\n",
"poke_df.iloc[4:10][['Name', 'Generation', 'GenerationLabel']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Encoding Categorical Features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## One-hot Encoding Scheme"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Legendary | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" False | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" False | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" True | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" True | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" False | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Legendary\n",
"4 Octillery Gen 2 False\n",
"5 Helioptile Gen 6 False\n",
"6 Dialga Gen 4 True\n",
"7 DeoxysDefense Forme Gen 3 True\n",
"8 Rapidash Gen 1 False\n",
"9 Swanna Gen 5 False"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the raw categorical columns for a handful of rows.\n",
"poke_df.iloc[4:10][['Name', 'Generation', 'Legendary']]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen_Label | \n",
" Legendary | \n",
" Lgnd_Label | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 1 | \n",
" False | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 5 | \n",
" False | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 3 | \n",
" True | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 2 | \n",
" True | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 0 | \n",
" False | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 4 | \n",
" False | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen_Label Legendary Lgnd_Label\n",
"4 Octillery Gen 2 1 False 0\n",
"5 Helioptile Gen 6 5 False 0\n",
"6 Dialga Gen 4 3 True 1\n",
"7 DeoxysDefense Forme Gen 3 2 True 1\n",
"8 Rapidash Gen 1 0 False 0\n",
"9 Swanna Gen 5 4 False 0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
"\n",
"# One label encoder per categorical column.\n",
"gen_le, leg_le = LabelEncoder(), LabelEncoder()\n",
"\n",
"# integer-encode the pokemon generation\n",
"gen_labels = gen_le.fit_transform(poke_df['Generation'])\n",
"poke_df['Gen_Label'] = gen_labels\n",
"\n",
"# integer-encode the pokemon legendary status\n",
"leg_labels = leg_le.fit_transform(poke_df['Legendary'])\n",
"poke_df['Lgnd_Label'] = leg_labels\n",
"\n",
"# Keep only the original columns and their integer codes.\n",
"encoded_cols = ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']\n",
"poke_df_sub = poke_df[encoded_cols]\n",
"poke_df_sub.iloc[4:10]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One-hot encode the integer generation labels; columns are named after the\n",
"# original generation strings held by the fitted label encoder.\n",
"gen_ohe = OneHotEncoder().fit(poke_df[['Gen_Label']])\n",
"gen_feature_arr = gen_ohe.transform(poke_df[['Gen_Label']]).toarray()\n",
"gen_feature_labels = [gen_name for gen_name in gen_le.classes_]\n",
"gen_features = pd.DataFrame(data=gen_feature_arr, columns=gen_feature_labels)\n",
"\n",
"# One-hot encode the integer legendary labels; columns get a 'Legendary_' prefix.\n",
"leg_ohe = OneHotEncoder().fit(poke_df[['Lgnd_Label']])\n",
"leg_feature_arr = leg_ohe.transform(poke_df[['Lgnd_Label']]).toarray()\n",
"leg_feature_labels = ['Legendary_' + str(flag) for flag in leg_le.classes_]\n",
"leg_features = pd.DataFrame(data=leg_feature_arr, columns=leg_feature_labels)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen_Label | \n",
" Gen 1 | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
" Gen 6 | \n",
" Legendary | \n",
" Lgnd_Label | \n",
" Legendary_False | \n",
" Legendary_True | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" False | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" False | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" True | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" True | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" False | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 4 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" False | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 \\\n",
"4 Octillery Gen 2 1 0.0 1.0 0.0 0.0 \n",
"5 Helioptile Gen 6 5 0.0 0.0 0.0 0.0 \n",
"6 Dialga Gen 4 3 0.0 0.0 0.0 1.0 \n",
"7 DeoxysDefense Forme Gen 3 2 0.0 0.0 1.0 0.0 \n",
"8 Rapidash Gen 1 0 1.0 0.0 0.0 0.0 \n",
"9 Swanna Gen 5 4 0.0 0.0 0.0 0.0 \n",
"\n",
" Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True \n",
"4 0.0 0.0 False 0 1.0 0.0 \n",
"5 0.0 1.0 False 0 1.0 0.0 \n",
"6 0.0 0.0 True 1 0.0 1.0 \n",
"7 0.0 0.0 True 1 0.0 1.0 \n",
"8 0.0 0.0 False 0 1.0 0.0 \n",
"9 1.0 0.0 False 0 1.0 0.0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Place the one-hot columns side by side with the label-encoded frame.\n",
"poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)\n",
"\n",
"# Display order: each source column is followed by its one-hot expansion.\n",
"columns = (['Name', 'Generation', 'Gen_Label'] + gen_feature_labels\n",
"           + ['Legendary', 'Lgnd_Label'] + leg_feature_labels)\n",
"poke_df_ohe.iloc[4:10][columns]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Legendary | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" PikaZoom | \n",
" Gen 3 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" CharMyToast | \n",
" Gen 4 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Legendary\n",
"0 PikaZoom Gen 3 True\n",
"1 CharMyToast Gen 4 False"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Small hand-built frame with two new pokemon entries.\n",
"new_poke_rows = [\n",
"    ['PikaZoom', 'Gen 3', True],\n",
"    ['CharMyToast', 'Gen 4', False],\n",
"]\n",
"new_poke_df = pd.DataFrame(data=new_poke_rows,\n",
"                           columns=['Name', 'Generation', 'Legendary'])\n",
"new_poke_df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen_Label | \n",
" Legendary | \n",
" Lgnd_Label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" PikaZoom | \n",
" Gen 3 | \n",
" 2 | \n",
" True | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" CharMyToast | \n",
" Gen 4 | \n",
" 3 | \n",
" False | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen_Label Legendary Lgnd_Label\n",
"0 PikaZoom Gen 3 2 True 1\n",
"1 CharMyToast Gen 4 3 False 0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-use the label encoders fitted earlier on poke_df so the new rows get the same integer codes.\n",
"new_gen_labels = gen_le.transform(new_poke_df['Generation'])\n",
"new_leg_labels = leg_le.transform(new_poke_df['Legendary'])\n",
"\n",
"new_poke_df['Gen_Label'] = new_gen_labels\n",
"new_poke_df['Lgnd_Label'] = new_leg_labels\n",
"\n",
"new_poke_df.loc[:, ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen_Label | \n",
" Gen 1 | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
" Gen 6 | \n",
" Legendary | \n",
" Lgnd_Label | \n",
" Legendary_False | \n",
" Legendary_True | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" PikaZoom | \n",
" Gen 3 | \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" True | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" CharMyToast | \n",
" Gen 4 | \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" False | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 \\\n",
"0 PikaZoom Gen 3 2 0.0 0.0 1.0 0.0 0.0 \n",
"1 CharMyToast Gen 4 3 0.0 0.0 0.0 1.0 0.0 \n",
"\n",
" Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True \n",
"0 0.0 True 1 0.0 1.0 \n",
"1 0.0 False 0 1.0 0.0 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Apply the already-fitted one-hot encoders to the new rows (transform only, no refit).\n",
"new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()\n",
"new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()\n",
"\n",
"# Wrap the arrays using the same column labels as the training-time features.\n",
"new_gen_features = pd.DataFrame(data=new_gen_feature_arr, columns=gen_feature_labels)\n",
"new_leg_features = pd.DataFrame(data=new_leg_feature_arr, columns=leg_feature_labels)\n",
"\n",
"new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)\n",
"columns = (['Name', 'Generation', 'Gen_Label'] + gen_feature_labels\n",
"           + ['Legendary', 'Lgnd_Label'] + leg_feature_labels)\n",
"new_poke_ohe.loc[:, columns]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen 1 | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
" Gen 6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n",
"4 Octillery Gen 2 0 1 0 0 0 0\n",
"5 Helioptile Gen 6 0 0 0 0 0 1\n",
"6 Dialga Gen 4 0 0 0 1 0 0\n",
"7 DeoxysDefense Forme Gen 3 0 0 1 0 0 0\n",
"8 Rapidash Gen 1 1 0 0 0 0 0\n",
"9 Swanna Gen 5 0 0 0 0 1 0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pandas shortcut: one indicator column per generation, straight from the strings.\n",
"gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n",
"poke_df[['Name', 'Generation']].join(gen_onehot_features).iloc[4:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dummy Coding Scheme"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
" Gen 6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n",
"4 Octillery Gen 2 1 0 0 0 0\n",
"5 Helioptile Gen 6 0 0 0 0 1\n",
"6 Dialga Gen 4 0 0 1 0 0\n",
"7 DeoxysDefense Forme Gen 3 0 1 0 0 0\n",
"8 Rapidash Gen 1 0 0 0 0 0\n",
"9 Swanna Gen 5 0 0 0 1 0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dummy coding: drop_first=True omits the first level's indicator column,\n",
"# so that level is represented by all zeros.\n",
"gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)\n",
"poke_df[['Name', 'Generation']].join(gen_dummy_features).iloc[4:10]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen 1 | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5\n",
"4 Octillery Gen 2 0 1 0 0 0\n",
"5 Helioptile Gen 6 0 0 0 0 0\n",
"6 Dialga Gen 4 0 0 0 1 0\n",
"7 DeoxysDefense Forme Gen 3 0 0 1 0 0\n",
"8 Rapidash Gen 1 1 0 0 0 0\n",
"9 Swanna Gen 5 0 0 0 0 1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dummy coding with the last level as the baseline: build the full one-hot\n",
"# matrix, then keep every indicator column except the final one.\n",
"gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n",
"kept_levels = gen_onehot_features.columns[:-1]\n",
"gen_dummy_features = gen_onehot_features[kept_levels]\n",
"poke_df[['Name', 'Generation']].join(gen_dummy_features).iloc[4:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Effect Coding Scheme"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Program Files\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:517: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" self.obj[item] = s\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Generation | \n",
" Gen 1 | \n",
" Gen 2 | \n",
" Gen 3 | \n",
" Gen 4 | \n",
" Gen 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" Octillery | \n",
" Gen 2 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" Helioptile | \n",
" Gen 6 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" Dialga | \n",
" Gen 4 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 7 | \n",
" DeoxysDefense Forme | \n",
" Gen 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 8 | \n",
" Rapidash | \n",
" Gen 1 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 9 | \n",
" Swanna | \n",
" Gen 5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5\n",
"4 Octillery Gen 2 0.0 1.0 0.0 0.0 0.0\n",
"5 Helioptile Gen 6 -1.0 -1.0 -1.0 -1.0 -1.0\n",
"6 Dialga Gen 4 0.0 0.0 0.0 1.0 0.0\n",
"7 DeoxysDefense Forme Gen 3 0.0 0.0 1.0 0.0 0.0\n",
"8 Rapidash Gen 1 1.0 0.0 0.0 0.0 0.0\n",
"9 Swanna Gen 5 0.0 0.0 0.0 0.0 1.0"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gen_onehot_features = pd.get_dummies(poke_df['Generation'])\n",
"\n",
"# Drop the last level (the reference category) and cast to float. astype returns\n",
"# an independent copy, so the assignment below no longer writes into a slice of\n",
"# gen_onehot_features (which raised SettingWithCopyWarning), and the float dtype\n",
"# can hold the -1 marker without relying on implicit upcasting.\n",
"gen_effect_features = gen_onehot_features.iloc[:, :-1].astype(float)\n",
"\n",
"# Rows of the dropped level are zero in every remaining column;\n",
"# effect coding marks those rows with -1 across the board.\n",
"is_reference_level = np.all(gen_effect_features == 0, axis=1)\n",
"gen_effect_features.loc[is_reference_level] = -1.\n",
"\n",
"pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Hashing Scheme"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total game genres: 12\n",
"['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'\n",
" 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']\n"
]
}
],
"source": [
"# Count the distinct genres to see how many columns one-hot encoding would need.\n",
"unique_genres = np.unique(vg_df[['Genre']].values)\n",
"genre_count = len(unique_genres)\n",
"print(\"Total game genres:\", genre_count)\n",
"print(unique_genres)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" Genre | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Super Mario Bros. | \n",
" Platform | \n",
" 0.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" -1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" Mario Kart Wii | \n",
" Racing | \n",
" -1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Wii Sports Resort | \n",
" Sports | \n",
" -2.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" -2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" Pokemon Red/Pokemon Blue | \n",
" Role-Playing | \n",
" -1.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" Tetris | \n",
" Puzzle | \n",
" 0.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" -2.0 | \n",
" 1.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" New Super Mario Bros. | \n",
" Platform | \n",
" 0.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" -1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name Genre 0 1 2 3 4 5\n",
"1 Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0\n",
"2 Mario Kart Wii Racing -1.0 0.0 0.0 0.0 0.0 -1.0\n",
"3 Wii Sports Resort Sports -2.0 2.0 0.0 -2.0 0.0 0.0\n",
"4 Pokemon Red/Pokemon Blue Role-Playing -1.0 1.0 2.0 0.0 1.0 -1.0\n",
"5 Tetris Puzzle 0.0 1.0 1.0 -2.0 1.0 -1.0\n",
"6 New Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction import FeatureHasher\n",
"\n",
"fh = FeatureHasher(n_features=6, input_type='string')\n",
"\n",
"# With input_type='string' every sample must itself be an iterable of string\n",
"# tokens. Passing the raw column makes each genre string a sample, so it is\n",
"# hashed character by character (recent scikit-learn releases reject bare-string\n",
"# samples with an error). Wrapping each genre in a one-element list hashes the\n",
"# whole genre name as a single token, giving one non-zero bucket per row.\n",
"genre_tokens = [[genre] for genre in vg_df['Genre']]\n",
"hashed_features = fh.fit_transform(genre_tokens).toarray()\n",
"\n",
"pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'dtype': numpy.float64,\n",
" 'input_type': 'string',\n",
" 'n_features': 6,\n",
" 'non_negative': False}"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the hasher's configuration as a parameter dict.\n",
"fh.get_params(deep=True)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}