{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classifying Lending Club loans as safe or risky\n",
    "\n",
    "This notebook loads the Lending Club loan data, derives a `+1` (safe) / `-1` (risky) label, balances the two classes by undersampling the safe loans, and fits a logistic regression classifier with GraphLab Create.\n",
    "\n",
    "It was last run with GraphLab Create v1.8.5 on a Python 2.7 kernel. Saved outputs are cleared on purpose; use **Restart Kernel -> Run All** to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import graphlab\n",
    "\n",
    "# Send GraphLab Canvas views (e.g. SArray.show) to the notebook.\n",
    "graphlab.canvas.set_target('ipynb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune lives here.\n",
    "LOANS_PATH = 'lending-club-data.gl/'  # SFrame directory, relative to this notebook\n",
    "SEED = 1                              # shared by the undersampling and the train/test split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Loan Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "loans_raw = graphlab.SFrame(LOANS_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "loans_raw.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the prediction target\n",
    "\n",
    "`bad_loans` is 1 for a risky loan and 0 for a safe one. The classifier target `safe_loans` recodes it as `+1` (safe) / `-1` (risky)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# safe_loans = +1 => safe\n",
    "# safe_loans = -1 => risky\n",
    "# 'bad_loans' is deliberately left in loans_raw so that this cell can be re-run;\n",
    "# it is dropped when the feature columns are selected further down.\n",
    "loans_raw['safe_loans'] = loans_raw['bad_loans'].apply(lambda x: +1 if x == 0 else -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "loans_raw['safe_loans'].show(view='Categorical')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = ['grade',                  # grade of the loan\n",
    "            'sub_grade',              # sub-grade of the loan\n",
    "            'short_emp',              # one year or less of employment\n",
    "            'emp_length_num',         # number of years of employment\n",
    "            'home_ownership',         # home_ownership status: own, mortgage or rent\n",
    "            'dti',                    # debt to income ratio\n",
    "            'purpose',                # the purpose of the loan\n",
    "            'term',                   # the term of the loan\n",
    "            'last_delinq_none',       # has borrower had a delinquency\n",
    "            'last_major_derog_none',  # has borrower had 90 day or worse rating\n",
    "            'revol_util',             # percent of available credit being used\n",
    "            'total_rec_late_fee',     # total late fees received to date\n",
    "           ]\n",
    "\n",
    "target = 'safe_loans'  # prediction target (y) (+1 means safe, -1 is risky)\n",
    "\n",
    "# Extract the feature columns and target column into a new frame;\n",
    "# loans_raw is left untouched so every cell above stays re-runnable.\n",
    "loans = loans_raw[features + [target]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Balance the classes\n",
    "\n",
    "Safe loans heavily outnumber risky loans, so the safe loans are undersampled until both classes are roughly the same size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "safe_loans_raw = loans[loans[target] == +1]\n",
    "risky_loans_raw = loans[loans[target] == -1]\n",
    "print('Number of safe loans : %s' % len(safe_loans_raw))\n",
    "print('Number of risky loans : %s' % len(risky_loans_raw))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# float() keeps the division fractional on Python 2.\n",
    "total_raw = float(len(safe_loans_raw) + len(risky_loans_raw))\n",
    "print('Percentage of safe loans : %s' % (len(safe_loans_raw) / total_raw))\n",
    "print('Percentage of risky loans : %s' % (len(risky_loans_raw) / total_raw))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Since there are fewer risky loans than safe loans, find the ratio of the sizes\n",
    "# and use that percentage to undersample the safe loans.\n",
    "percentage = len(risky_loans_raw) / float(len(safe_loans_raw))\n",
    "\n",
    "risky_loans = risky_loans_raw\n",
    "safe_loans = safe_loans_raw.sample(percentage, seed=SEED)\n",
    "\n",
    "# Append the risky_loans with the downsampled version of safe_loans\n",
    "loans_data = risky_loans.append(safe_loans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "balanced_total = float(len(loans_data))\n",
    "print('Percentage of safe loans : %s' % (len(safe_loans) / balanced_total))\n",
    "print('Percentage of risky loans : %s' % (len(risky_loans) / balanced_total))\n",
    "print('Total number of loans in our new dataset : %s' % len(loans_data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_data, test_data = loans_data.random_split(.8, seed=SEED)\n",
    "\n",
    "# Sanity check: the split must neither lose nor duplicate rows.\n",
    "assert len(train_data) + len(test_data) == len(loans_data)\n",
    "\n",
    "print(len(train_data))\n",
    "print(len(test_data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit the logistic regression classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# `target` must be the *name* of the label column ('safe_loans').\n",
    "# Passing an SFrame here raises\n",
    "# 'TypeError: Invalid key type: must be str, bytes or type'.\n",
    "Logistic_Regression_Model = graphlab.logistic_classifier.create(\n",
    "    train_data,\n",
    "    target=target,\n",
    "    features=features,\n",
    "    validation_set=None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}