{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cross validation" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn import metrics\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.linear_model import LogisticRegressionCV\n", "\n", "# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.\n", "# Try the legacy module first so environments where it still exists keep the exact\n", "# same KFold/cross_val_score objects; fall back to sklearn.model_selection otherwise.\n", "try:\n", "    from sklearn.cross_validation import KFold, cross_val_score\n", "    from sklearn.cross_validation import train_test_split\n", "except ImportError:\n", "    # NOTE(review): model_selection.KFold takes n_splits instead of (n, n_folds);\n", "    # any later cell that builds a KFold must be checked against this signature.\n", "    from sklearn.model_selection import KFold, cross_val_score\n", "    from sklearn.model_selection import train_test_split\n", "\n", "# IPython/Jupyter magic: drop this line when running as a plain Python script or in Canopy.\n", "%matplotlib inline\n", "\n", "path = 'data/adult.csv'\n", "data = pd.read_csv(path)\n", "# Remove rows where occupation is unknown ('?'); .copy() makes the result an\n", "# independent frame so later column assignments do not raise SettingWithCopyWarning.\n", "data = data[data.occupation != '?'].copy()\n", "# Independent copy of the filtered rows; re-applying the same filter here was a no-op.\n", "raw_data = data.copy()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "education.num | \n", "marital.status | \n", "occupation | \n", "relationship | \n", "race | \n", "sex | \n", "capital.gain | \n", "capital.loss | \n", "hours.per.week | \n", "native.country | \n", "income | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "82 | \n", "Private | \n", "132870 | \n", "HS-grad | \n", "9 | \n", "Widowed | \n", "Exec-managerial | \n", "Not-in-family | \n", "White | \n", "Female | \n", "0 | \n", "4356 | \n", "18 | \n", "United-States | \n", "<=50K | \n", "
3 | \n", "54 | \n", "Private | \n", "140359 | \n", "7th-8th | \n", "4 | \n", "Divorced | \n", "Machine-op-inspct | \n", "Unmarried | \n", "White | \n", "Female | \n", "0 | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "
4 | \n", "41 | \n", "Private | \n", "264663 | \n", "Some-college | \n", "10 | \n", "Separated | \n", "Prof-specialty | \n", "Own-child | \n", "White | \n", "Female | \n", "0 | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "
5 | \n", "34 | \n", "Private | \n", "216864 | \n", "HS-grad | \n", "9 | \n", "Divorced | \n", "Other-service | \n", "Unmarried | \n", "White | \n", "Female | \n", "0 | \n", "3770 | \n", "45 | \n", "United-States | \n", "<=50K | \n", "
6 | \n", "38 | \n", "Private | \n", "150601 | \n", "10th | \n", "6 | \n", "Separated | \n", "Adm-clerical | \n", "Unmarried | \n", "White | \n", "Male | \n", "0 | \n", "3770 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "
\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "education.num | \n", "marital.status | \n", "occupation | \n", "relationship | \n", "race | \n", "sex | \n", "... | \n", "capital.loss | \n", "hours.per.week | \n", "native.country | \n", "income | \n", "workclass_num | \n", "over50K | \n", "marital_num | \n", "race_num | \n", "sex_num | \n", "rel_num | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "82 | \n", "Private | \n", "132870 | \n", "HS-grad | \n", "9 | \n", "Widowed | \n", "Exec-managerial | \n", "Not-in-family | \n", "White | \n", "Female | \n", "... | \n", "4356 | \n", "18 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "54 | \n", "Private | \n", "140359 | \n", "7th-8th | \n", "4 | \n", "Divorced | \n", "Machine-op-inspct | \n", "Unmarried | \n", "White | \n", "Female | \n", "... | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "41 | \n", "Private | \n", "264663 | \n", "Some-college | \n", "10 | \n", "Separated | \n", "Prof-specialty | \n", "Own-child | \n", "White | \n", "Female | \n", "... | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 | \n", "34 | \n", "Private | \n", "216864 | \n", "HS-grad | \n", "9 | \n", "Divorced | \n", "Other-service | \n", "Unmarried | \n", "White | \n", "Female | \n", "... | \n", "3770 | \n", "45 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
6 | \n", "38 | \n", "Private | \n", "150601 | \n", "10th | \n", "6 | \n", "Separated | \n", "Adm-clerical | \n", "Unmarried | \n", "White | \n", "Male | \n", "... | \n", "3770 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "1 | \n", "0 | \n", "
5 rows × 21 columns
\n", "\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "education.num | \n", "marital.status | \n", "occupation | \n", "relationship | \n", "race | \n", "sex | \n", "... | \n", "capital.loss | \n", "hours.per.week | \n", "native.country | \n", "income | \n", "workclass_num | \n", "over50K | \n", "marital_num | \n", "race_num | \n", "sex_num | \n", "rel_num | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "82 | \n", "Private | \n", "132870 | \n", "HS-grad | \n", "9 | \n", "Widowed | \n", "Exec-managerial | \n", "Not-in-family | \n", "White | \n", "Female | \n", "... | \n", "4356 | \n", "18 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "54 | \n", "Private | \n", "140359 | \n", "7th-8th | \n", "4 | \n", "Divorced | \n", "Machine-op-inspct | \n", "Unmarried | \n", "White | \n", "Female | \n", "... | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "41 | \n", "Private | \n", "264663 | \n", "Some-college | \n", "10 | \n", "Separated | \n", "Prof-specialty | \n", "Own-child | \n", "White | \n", "Female | \n", "... | \n", "3900 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 | \n", "34 | \n", "Private | \n", "216864 | \n", "HS-grad | \n", "9 | \n", "Divorced | \n", "Other-service | \n", "Unmarried | \n", "White | \n", "Female | \n", "... | \n", "3770 | \n", "45 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
6 | \n", "38 | \n", "Private | \n", "150601 | \n", "10th | \n", "6 | \n", "Separated | \n", "Adm-clerical | \n", "Unmarried | \n", "White | \n", "Male | \n", "... | \n", "3770 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "1 | \n", "0 | \n", "
5 rows × 21 columns
\n", "