{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true, "deletable": true, "editable": true }, "source": [ "##
One-Hot Encoding
\n", "\n", "- used on categorical variables\n", "- it replaces a categorical variable/feature with one or more new features that will take the values of 0 or 1\n", "- increases the data burden: every category becomes its own column (in this notebook, 7 columns become 46)\n", "- increases the efficiency of the modelling process, since models that need numeric input (such as logistic regression) can use the encoded features directly" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import pandas as pd\n", "from IPython.display import display\n", "\n", "data = pd.read_csv('adult.data', header=None, index_col=False, names=['age', 'workclass', 'fnlwgt', 'education', \n", " 'education-num', 'marital-status', 'occupation', \n", " 'relationship', 'race', 'gender', 'capital-gain', \n", " 'capital-loss', 'hours-per-week', 'native-country', \n", " 'income'])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageworkclasseducationgenderhours-per-weekoccupationincome
039State-govBachelorsMale40Adm-clerical<=50K
150Self-emp-not-incBachelorsMale13Exec-managerial<=50K
238PrivateHS-gradMale40Handlers-cleaners<=50K
353Private11thMale40Handlers-cleaners<=50K
428PrivateBachelorsFemale40Prof-specialty<=50K
537PrivateMastersFemale40Exec-managerial<=50K
649Private9thFemale16Other-service<=50K
752Self-emp-not-incHS-gradMale45Exec-managerial>50K
831PrivateMastersFemale50Prof-specialty>50K
942PrivateBachelorsMale40Exec-managerial>50K
1037PrivateSome-collegeMale80Exec-managerial>50K
1130State-govBachelorsMale40Prof-specialty>50K
1223PrivateBachelorsFemale30Adm-clerical<=50K
1332PrivateAssoc-acdmMale50Sales<=50K
1440PrivateAssoc-vocMale40Craft-repair>50K
1534Private7th-8thMale45Transport-moving<=50K
1625Self-emp-not-incHS-gradMale35Farming-fishing<=50K
1732PrivateHS-gradMale40Machine-op-inspct<=50K
1838Private11thMale50Sales<=50K
1943Self-emp-not-incMastersFemale45Exec-managerial>50K
2040PrivateDoctorateMale60Prof-specialty>50K
2154PrivateHS-gradFemale20Other-service<=50K
2235Federal-gov9thMale40Farming-fishing<=50K
2343Private11thMale40Transport-moving<=50K
2459PrivateHS-gradFemale40Tech-support<=50K
2556Local-govBachelorsMale40Tech-support>50K
2619PrivateHS-gradMale40Craft-repair<=50K
2754?Some-collegeMale60?>50K
2839PrivateHS-gradMale80Exec-managerial<=50K
2949PrivateHS-gradMale40Craft-repair<=50K
........................
3253130?BachelorsFemale99?<=50K
3253234PrivateDoctorateMale60Prof-specialty>50K
3253354PrivateBachelorsMale50Exec-managerial>50K
3253437PrivateSome-collegeFemale39Adm-clerical<=50K
3253522Private12thMale35Protective-serv<=50K
3253634PrivateBachelorsFemale55Exec-managerial>50K
3253730PrivateHS-gradMale46Craft-repair<=50K
3253838PrivateBachelorsFemale45Prof-specialty>50K
3253971?DoctorateMale10?>50K
3254045State-govHS-gradFemale40Adm-clerical<=50K
3254141?HS-gradFemale32?<=50K
3254272?HS-gradMale25?<=50K
3254345Local-govAssoc-acdmFemale48Prof-specialty<=50K
3254431PrivateMastersFemale30Other-service<=50K
3254539Local-govAssoc-acdmFemale20Adm-clerical>50K
3254637PrivateAssoc-acdmFemale40Tech-support<=50K
3254743PrivateHS-gradMale40Machine-op-inspct<=50K
3254865Self-emp-not-incProf-schoolMale60Prof-specialty<=50K
3254943State-govSome-collegeFemale40Adm-clerical<=50K
3255043Self-emp-not-incSome-collegeMale50Craft-repair<=50K
3255132Private10thMale40Handlers-cleaners<=50K
3255243PrivateAssoc-vocMale45Sales<=50K
3255332PrivateMastersMale11Tech-support<=50K
3255453PrivateMastersMale40Exec-managerial>50K
3255522PrivateSome-collegeMale40Protective-serv<=50K
3255627PrivateAssoc-acdmFemale38Tech-support<=50K
3255740PrivateHS-gradMale40Machine-op-inspct>50K
3255858PrivateHS-gradFemale40Adm-clerical<=50K
3255922PrivateHS-gradMale20Adm-clerical<=50K
3256052Self-emp-incHS-gradFemale40Exec-managerial>50K
\n", "

32561 rows × 7 columns

\n", "
" ], "text/plain": [ " age workclass education gender hours-per-week \\\n", "0 39 State-gov Bachelors Male 40 \n", "1 50 Self-emp-not-inc Bachelors Male 13 \n", "2 38 Private HS-grad Male 40 \n", "3 53 Private 11th Male 40 \n", "4 28 Private Bachelors Female 40 \n", "5 37 Private Masters Female 40 \n", "6 49 Private 9th Female 16 \n", "7 52 Self-emp-not-inc HS-grad Male 45 \n", "8 31 Private Masters Female 50 \n", "9 42 Private Bachelors Male 40 \n", "10 37 Private Some-college Male 80 \n", "11 30 State-gov Bachelors Male 40 \n", "12 23 Private Bachelors Female 30 \n", "13 32 Private Assoc-acdm Male 50 \n", "14 40 Private Assoc-voc Male 40 \n", "15 34 Private 7th-8th Male 45 \n", "16 25 Self-emp-not-inc HS-grad Male 35 \n", "17 32 Private HS-grad Male 40 \n", "18 38 Private 11th Male 50 \n", "19 43 Self-emp-not-inc Masters Female 45 \n", "20 40 Private Doctorate Male 60 \n", "21 54 Private HS-grad Female 20 \n", "22 35 Federal-gov 9th Male 40 \n", "23 43 Private 11th Male 40 \n", "24 59 Private HS-grad Female 40 \n", "25 56 Local-gov Bachelors Male 40 \n", "26 19 Private HS-grad Male 40 \n", "27 54 ? Some-college Male 60 \n", "28 39 Private HS-grad Male 80 \n", "29 49 Private HS-grad Male 40 \n", "... ... ... ... ... ... \n", "32531 30 ? Bachelors Female 99 \n", "32532 34 Private Doctorate Male 60 \n", "32533 54 Private Bachelors Male 50 \n", "32534 37 Private Some-college Female 39 \n", "32535 22 Private 12th Male 35 \n", "32536 34 Private Bachelors Female 55 \n", "32537 30 Private HS-grad Male 46 \n", "32538 38 Private Bachelors Female 45 \n", "32539 71 ? Doctorate Male 10 \n", "32540 45 State-gov HS-grad Female 40 \n", "32541 41 ? HS-grad Female 32 \n", "32542 72 ? 
HS-grad Male 25 \n", "32543 45 Local-gov Assoc-acdm Female 48 \n", "32544 31 Private Masters Female 30 \n", "32545 39 Local-gov Assoc-acdm Female 20 \n", "32546 37 Private Assoc-acdm Female 40 \n", "32547 43 Private HS-grad Male 40 \n", "32548 65 Self-emp-not-inc Prof-school Male 60 \n", "32549 43 State-gov Some-college Female 40 \n", "32550 43 Self-emp-not-inc Some-college Male 50 \n", "32551 32 Private 10th Male 40 \n", "32552 43 Private Assoc-voc Male 45 \n", "32553 32 Private Masters Male 11 \n", "32554 53 Private Masters Male 40 \n", "32555 22 Private Some-college Male 40 \n", "32556 27 Private Assoc-acdm Female 38 \n", "32557 40 Private HS-grad Male 40 \n", "32558 58 Private HS-grad Female 40 \n", "32559 22 Private HS-grad Male 20 \n", "32560 52 Self-emp-inc HS-grad Female 40 \n", "\n", " occupation income \n", "0 Adm-clerical <=50K \n", "1 Exec-managerial <=50K \n", "2 Handlers-cleaners <=50K \n", "3 Handlers-cleaners <=50K \n", "4 Prof-specialty <=50K \n", "5 Exec-managerial <=50K \n", "6 Other-service <=50K \n", "7 Exec-managerial >50K \n", "8 Prof-specialty >50K \n", "9 Exec-managerial >50K \n", "10 Exec-managerial >50K \n", "11 Prof-specialty >50K \n", "12 Adm-clerical <=50K \n", "13 Sales <=50K \n", "14 Craft-repair >50K \n", "15 Transport-moving <=50K \n", "16 Farming-fishing <=50K \n", "17 Machine-op-inspct <=50K \n", "18 Sales <=50K \n", "19 Exec-managerial >50K \n", "20 Prof-specialty >50K \n", "21 Other-service <=50K \n", "22 Farming-fishing <=50K \n", "23 Transport-moving <=50K \n", "24 Tech-support <=50K \n", "25 Tech-support >50K \n", "26 Craft-repair <=50K \n", "27 ? >50K \n", "28 Exec-managerial <=50K \n", "29 Craft-repair <=50K \n", "... ... ... \n", "32531 ? <=50K \n", "32532 Prof-specialty >50K \n", "32533 Exec-managerial >50K \n", "32534 Adm-clerical <=50K \n", "32535 Protective-serv <=50K \n", "32536 Exec-managerial >50K \n", "32537 Craft-repair <=50K \n", "32538 Prof-specialty >50K \n", "32539 ? 
>50K \n", "32540 Adm-clerical <=50K \n", "32541 ? <=50K \n", "32542 ? <=50K \n", "32543 Prof-specialty <=50K \n", "32544 Other-service <=50K \n", "32545 Adm-clerical >50K \n", "32546 Tech-support <=50K \n", "32547 Machine-op-inspct <=50K \n", "32548 Prof-specialty <=50K \n", "32549 Adm-clerical <=50K \n", "32550 Craft-repair <=50K \n", "32551 Handlers-cleaners <=50K \n", "32552 Sales <=50K \n", "32553 Tech-support <=50K \n", "32554 Exec-managerial >50K \n", "32555 Protective-serv <=50K \n", "32556 Tech-support <=50K \n", "32557 Machine-op-inspct >50K \n", "32558 Adm-clerical <=50K \n", "32559 Adm-clerical <=50K \n", "32560 Exec-managerial >50K \n", "\n", "[32561 rows x 7 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']]\n", "display(data)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original Features:\n", " ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] \n", "\n", "Features after One-Hot Encoding:\n", " ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ 
Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']\n" ] } ], "source": [ "print('Original Features:\\n', list(data.columns), '\\n')\n", "data_dummies = pd.get_dummies(data)\n", "print('Features after One-Hot Encoding:\\n', list(data_dummies.columns))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Label-based slice, inclusive on both ends: keeps every column from 'age' through the last\n", "# occupation dummy, so the two income_* indicator columns stay out of the feature matrix.\n", "features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']\n", "X = features.values\n", "# Target: the indicator column for income >50K.\n", "y = data_dummies['income_ >50K'].values" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logistic Regression score on the test set: 0.81\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", "logreg = LogisticRegression()\n", "logreg.fit(X_train, y_train)\n", "\n", "print('Logistic Regression score on the test set: {:.2f}'.format(logreg.score(X_test, y_test)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.0" } }, "nbformat": 4, 
"nbformat_minor": 2 }