{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Spam classification using logistic regression\n", "\n", "We have 4601 emails, each described by 57 features. The data can be found on [GitHub](https://github.com/probml/pmtk3/tree/master/data/spamData). The column names can be found [here](ftp://ftp.ics.uci.edu/pub/machine-learning-databases/spambase/spambase.names). We want to predict whether an email is spam or not, so we have a binary response, where 1 indicates spam and 0 indicates non-spam.\n", "\n", "You can find some utility functions on [GitHub](https://github.com/ppham27/MLaPP-solutions/blob/master/chap08/classifiers.py)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | spam | \n", "word_freq_make | \n", "word_freq_address | \n", "word_freq_all | \n", "word_freq_3d | \n", "word_freq_our | \n", "word_freq_over | \n", "word_freq_remove | \n", "word_freq_internet | \n", "word_freq_order | \n", "... | \n", "word_freq_conference | \n", "char_freq_; | \n", "char_freq_( | \n", "char_freq_[ | \n", "char_freq_! | \n", "char_freq_$ | \n", "char_freq_# | \n", "capital_run_length_average | \n", "capital_run_length_longest | \n", "capital_run_length_total | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "0.21 | \n", "0.28 | \n", "0.5 | \n", "0.0 | \n", "0.14 | \n", "0.28 | \n", "0.21 | \n", "0.07 | \n", "0.00 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.132 | \n", "0.0 | \n", "0.372 | \n", "0.180 | \n", "0.048 | \n", "5.114 | \n", "101.0 | \n", "1028.0 | \n", "
1 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "0.63 | \n", "0.00 | \n", "0.31 | \n", "0.63 | \n", "0.31 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.137 | \n", "0.0 | \n", "0.137 | \n", "0.000 | \n", "0.000 | \n", "3.537 | \n", "40.0 | \n", "191.0 | \n", "
2 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "0.63 | \n", "0.00 | \n", "0.31 | \n", "0.63 | \n", "0.31 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.135 | \n", "0.0 | \n", "0.135 | \n", "0.000 | \n", "0.000 | \n", "3.537 | \n", "40.0 | \n", "191.0 | \n", "
3 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "1.85 | \n", "0.00 | \n", "0.00 | \n", "1.85 | \n", "0.00 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.223 | \n", "0.0 | \n", "0.000 | \n", "0.000 | \n", "0.000 | \n", "3.000 | \n", "15.0 | \n", "54.0 | \n", "
4 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "1.92 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.054 | \n", "0.0 | \n", "0.164 | \n", "0.054 | \n", "0.000 | \n", "1.671 | \n", "4.0 | \n", "112.0 | \n", "
5 rows × 58 columns
\n", "\n", " | Regularization | \n", "Raw Train | \n", "Standard Train | \n", "Log Train | \n", "Binary Train | \n", "Raw Test | \n", "Standard Test | \n", "Log Test | \n", "Binary Test | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.0 | \n", "0.070799 | \n", "0.070147 | \n", "0.051876 | \n", "0.064600 | \n", "0.076172 | \n", "0.086589 | \n", "0.058594 | \n", "0.074219 | \n", "
1 | \n", "0.5 | \n", "0.069494 | \n", "0.072431 | \n", "0.051223 | \n", "0.064274 | \n", "0.070964 | \n", "0.087891 | \n", "0.059245 | \n", "0.072266 | \n", "
2 | \n", "1.0 | \n", "0.069821 | \n", "0.074062 | \n", "0.050571 | \n", "0.063295 | \n", "0.070312 | \n", "0.087240 | \n", "0.059896 | \n", "0.072266 | \n", "
3 | \n", "1.5 | \n", "0.071126 | \n", "0.075367 | \n", "0.049918 | \n", "0.063295 | \n", "0.070312 | \n", "0.085938 | \n", "0.060547 | \n", "0.072917 | \n", "
4 | \n", "2.0 | \n", "0.072757 | \n", "0.076020 | \n", "0.050245 | \n", "0.063295 | \n", "0.069661 | \n", "0.084635 | \n", "0.061198 | \n", "0.073568 | \n", "
5 | \n", "2.5 | \n", "0.073409 | \n", "0.075367 | \n", "0.050571 | \n", "0.063295 | \n", "0.071615 | \n", "0.085286 | \n", "0.059896 | \n", "0.074219 | \n", "
6 | \n", "3.0 | \n", "0.072757 | \n", "0.075693 | \n", "0.051876 | \n", "0.063948 | \n", "0.072266 | \n", "0.084635 | \n", "0.059896 | \n", "0.074219 | \n", "
7 | \n", "3.5 | \n", "0.074715 | \n", "0.075367 | \n", "0.051223 | \n", "0.063295 | \n", "0.075521 | \n", "0.085286 | \n", "0.059896 | \n", "0.074219 | \n", "
8 | \n", "4.0 | \n", "0.075693 | \n", "0.076020 | \n", "0.051223 | \n", "0.063622 | \n", "0.075521 | \n", "0.083984 | \n", "0.059896 | \n", "0.074219 | \n", "
9 | \n", "4.5 | \n", "0.076020 | \n", "0.076020 | \n", "0.051223 | \n", "0.063295 | \n", "0.076823 | \n", "0.084635 | \n", "0.059896 | \n", "0.074219 | \n", "
10 | \n", "5.0 | \n", "0.075041 | \n", "0.076020 | \n", "0.050897 | \n", "0.063622 | \n", "0.076823 | \n", "0.085286 | \n", "0.059896 | \n", "0.074870 | \n", "
11 | \n", "5.5 | \n", "0.074062 | \n", "0.076346 | \n", "0.050897 | \n", "0.063622 | \n", "0.077474 | \n", "0.085938 | \n", "0.059896 | \n", "0.075521 | \n", "
12 | \n", "6.0 | \n", "0.074388 | \n", "0.076998 | \n", "0.050571 | \n", "0.063948 | \n", "0.076823 | \n", "0.085286 | \n", "0.059896 | \n", "0.076172 | \n", "
13 | \n", "6.5 | \n", "0.074388 | \n", "0.076998 | \n", "0.050245 | \n", "0.064600 | \n", "0.075521 | \n", "0.083984 | \n", "0.058594 | \n", "0.076172 | \n", "
14 | \n", "7.0 | \n", "0.074388 | \n", "0.077651 | \n", "0.050245 | \n", "0.064600 | \n", "0.076172 | \n", "0.084635 | \n", "0.058594 | \n", "0.076172 | \n", "
15 | \n", "7.5 | \n", "0.074388 | \n", "0.078303 | \n", "0.050897 | \n", "0.064600 | \n", "0.076172 | \n", "0.084635 | \n", "0.057943 | \n", "0.076172 | \n", "
16 | \n", "8.0 | \n", "0.074388 | \n", "0.078303 | \n", "0.051550 | \n", "0.064927 | \n", "0.076823 | \n", "0.083984 | \n", "0.057943 | \n", "0.076172 | \n", "
17 | \n", "8.5 | \n", "0.074715 | \n", "0.078630 | \n", "0.051550 | \n", "0.064927 | \n", "0.077474 | \n", "0.083984 | \n", "0.059245 | \n", "0.075521 | \n", "
18 | \n", "9.0 | \n", "0.075367 | \n", "0.078630 | \n", "0.051876 | \n", "0.064600 | \n", "0.076823 | \n", "0.083984 | \n", "0.059245 | \n", "0.075521 | \n", "
19 | \n", "9.5 | \n", "0.074715 | \n", "0.079282 | \n", "0.051876 | \n", "0.065253 | \n", "0.076823 | \n", "0.083984 | \n", "0.059245 | \n", "0.075521 | \n", "
20 | \n", "10.0 | \n", "0.074715 | \n", "0.078956 | \n", "0.051876 | \n", "0.064600 | \n", "0.076823 | \n", "0.085286 | \n", "0.059245 | \n", "0.075521 | \n", "
21 | \n", "10.5 | \n", "0.074715 | \n", "0.078956 | \n", "0.052202 | \n", "0.064600 | \n", "0.076823 | \n", "0.084635 | \n", "0.059245 | \n", "0.075521 | \n", "
22 | \n", "11.0 | \n", "0.074388 | \n", "0.079282 | \n", "0.051550 | \n", "0.065253 | \n", "0.077474 | \n", "0.084635 | \n", "0.059245 | \n", "0.074219 | \n", "
23 | \n", "11.5 | \n", "0.075367 | \n", "0.079282 | \n", "0.051223 | \n", "0.066232 | \n", "0.077474 | \n", "0.084635 | \n", "0.059245 | \n", "0.074870 | \n", "
24 | \n", "12.0 | \n", "0.075367 | \n", "0.079608 | \n", "0.051550 | \n", "0.069168 | \n", "0.077474 | \n", "0.084635 | \n", "0.059245 | \n", "0.075521 | \n", "
25 | \n", "12.5 | \n", "0.075693 | \n", "0.079608 | \n", "0.051550 | \n", "0.069821 | \n", "0.077474 | \n", "0.085286 | \n", "0.059245 | \n", "0.074219 | \n", "
26 | \n", "13.0 | \n", "0.075693 | \n", "0.079608 | \n", "0.051550 | \n", "0.070147 | \n", "0.076823 | \n", "0.085286 | \n", "0.059245 | \n", "0.074219 | \n", "
27 | \n", "13.5 | \n", "0.075693 | \n", "0.079608 | \n", "0.051550 | \n", "0.070147 | \n", "0.076823 | \n", "0.085286 | \n", "0.059896 | \n", "0.074219 | \n", "
28 | \n", "14.0 | \n", "0.076020 | \n", "0.080587 | \n", "0.052202 | \n", "0.070147 | \n", "0.076823 | \n", "0.085286 | \n", "0.059896 | \n", "0.075521 | \n", "
29 | \n", "14.5 | \n", "0.076346 | \n", "0.080587 | \n", "0.052202 | \n", "0.069821 | \n", "0.077474 | \n", "0.085286 | \n", "0.060547 | \n", "0.075521 | \n", "
30 | \n", "15.0 | \n", "0.076346 | \n", "0.080587 | \n", "0.051876 | \n", "0.069821 | \n", "0.079427 | \n", "0.085286 | \n", "0.060547 | \n", "0.075521 | \n", "
31 | \n", "15.5 | \n", "0.076346 | \n", "0.080914 | \n", "0.051876 | \n", "0.070473 | \n", "0.079427 | \n", "0.085286 | \n", "0.060547 | \n", "0.078125 | \n", "
32 | \n", "16.0 | \n", "0.076346 | \n", "0.080914 | \n", "0.052202 | \n", "0.071778 | \n", "0.080078 | \n", "0.085286 | \n", "0.061849 | \n", "0.078776 | \n", "
33 | \n", "16.5 | \n", "0.076346 | \n", "0.081240 | \n", "0.052202 | \n", "0.071778 | \n", "0.080078 | \n", "0.085286 | \n", "0.061849 | \n", "0.078776 | \n", "
34 | \n", "17.0 | \n", "0.077325 | \n", "0.080914 | \n", "0.051876 | \n", "0.071452 | \n", "0.080078 | \n", "0.085938 | \n", "0.061849 | \n", "0.078776 | \n", "
35 | \n", "17.5 | \n", "0.077651 | \n", "0.080914 | \n", "0.051876 | \n", "0.071452 | \n", "0.080729 | \n", "0.085938 | \n", "0.062500 | \n", "0.078776 | \n", "
36 | \n", "18.0 | \n", "0.077651 | \n", "0.080914 | \n", "0.051550 | \n", "0.071452 | \n", "0.080729 | \n", "0.085938 | \n", "0.062500 | \n", "0.078776 | \n", "
37 | \n", "18.5 | \n", "0.077651 | \n", "0.081240 | \n", "0.051550 | \n", "0.071452 | \n", "0.080729 | \n", "0.085938 | \n", "0.062500 | \n", "0.079427 | \n", "
38 | \n", "19.0 | \n", "0.077651 | \n", "0.081566 | \n", "0.051876 | \n", "0.071452 | \n", "0.081380 | \n", "0.086589 | \n", "0.062500 | \n", "0.079427 | \n", "
39 | \n", "19.5 | \n", "0.077651 | \n", "0.081566 | \n", "0.052529 | \n", "0.071778 | \n", "0.081380 | \n", "0.086589 | \n", "0.062500 | \n", "0.080078 | \n", "
40 | \n", "20.0 | \n", "0.077651 | \n", "0.081566 | \n", "0.052529 | \n", "0.072431 | \n", "0.081380 | \n", "0.086589 | \n", "0.062500 | \n", "0.080078 | \n", "