{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9\n", "\n", "## Mashable news stories analysis\n", "\n", "Predicting if a news story is going to be popular\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urltimedeltan_tokens_titlen_tokens_contentn_unique_tokensn_non_stop_wordsn_non_stop_unique_tokensnum_hrefsnum_self_hrefsnum_imgs...min_positive_polaritymax_positive_polarityavg_negative_polaritymin_negative_polaritymax_negative_polaritytitle_subjectivitytitle_sentiment_polarityabs_title_subjectivityabs_title_sentiment_polarityPopular
0http://mashable.com/2014/12/10/cia-torture-rep...28.09.0188.00.7326201.00.8442625.01.01.0...0.2000000.80-0.487500-0.60-0.2500000.90.80.40.81
1http://mashable.com/2013/10/18/bitlock-kicksta...447.07.0297.00.6531991.00.8157899.04.01.0...0.1600000.50-0.135340-0.40-0.0500000.1-0.10.40.10
2http://mashable.com/2013/07/24/google-glass-po...533.011.0181.00.6603771.00.7757014.03.01.0...0.1363641.000.0000000.000.0000000.31.00.21.00
3http://mashable.com/2013/11/21/these-are-the-m...413.012.0781.00.4974091.00.67735010.03.01.0...0.1000001.00-0.195701-0.40-0.0714290.00.00.50.00
4http://mashable.com/2014/02/11/parking-ticket-...331.08.0177.00.6857141.00.8303573.02.01.0...0.1000000.55-0.175000-0.25-0.1000000.00.00.50.00
\n", "

5 rows × 61 columns

\n", "
" ], "text/plain": [ " url timedelta \\\n", "0 http://mashable.com/2014/12/10/cia-torture-rep... 28.0 \n", "1 http://mashable.com/2013/10/18/bitlock-kicksta... 447.0 \n", "2 http://mashable.com/2013/07/24/google-glass-po... 533.0 \n", "3 http://mashable.com/2013/11/21/these-are-the-m... 413.0 \n", "4 http://mashable.com/2014/02/11/parking-ticket-... 331.0 \n", "\n", " n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words \\\n", "0 9.0 188.0 0.732620 1.0 \n", "1 7.0 297.0 0.653199 1.0 \n", "2 11.0 181.0 0.660377 1.0 \n", "3 12.0 781.0 0.497409 1.0 \n", "4 8.0 177.0 0.685714 1.0 \n", "\n", " n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs ... \\\n", "0 0.844262 5.0 1.0 1.0 ... \n", "1 0.815789 9.0 4.0 1.0 ... \n", "2 0.775701 4.0 3.0 1.0 ... \n", "3 0.677350 10.0 3.0 1.0 ... \n", "4 0.830357 3.0 2.0 1.0 ... \n", "\n", " min_positive_polarity max_positive_polarity avg_negative_polarity \\\n", "0 0.200000 0.80 -0.487500 \n", "1 0.160000 0.50 -0.135340 \n", "2 0.136364 1.00 0.000000 \n", "3 0.100000 1.00 -0.195701 \n", "4 0.100000 0.55 -0.175000 \n", "\n", " min_negative_polarity max_negative_polarity title_subjectivity \\\n", "0 -0.60 -0.250000 0.9 \n", "1 -0.40 -0.050000 0.1 \n", "2 0.00 0.000000 0.3 \n", "3 -0.40 -0.071429 0.0 \n", "4 -0.25 -0.100000 0.0 \n", "\n", " title_sentiment_polarity abs_title_subjectivity \\\n", "0 0.8 0.4 \n", "1 -0.1 0.4 \n", "2 1.0 0.2 \n", "3 0.0 0.5 \n", "4 0.0 0.5 \n", "\n", " abs_title_sentiment_polarity Popular \n", "0 0.8 1 \n", "1 0.1 0 \n", "2 1.0 0 \n", "3 0.0 0 \n", "4 0.0 0 \n", "\n", "[5 rows x 61 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "url = 'https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/mashable.csv'\n", "df = pd.read_csv(url, index_col=0)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"(6000, 61)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Features: every column except the article URL and the target label\n", "X = df.drop(['url', 'Popular'], axis=1)\n", "# Target: the binary 'Popular' column\n", "y = df['Popular']" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean of the 0/1 target = share of stories labelled Popular (class balance check)\n", "y.mean()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# train/test split (random_state pins the shuffle so the split is reproducible)\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.1\n", "\n", "Estimate a Decision Tree Classifier and a Logistic Regression\n", "\n", "Evaluate using the following metrics:\n", "* Accuracy\n", "* F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.2\n", "\n", "Estimate 300 bagged samples\n", "\n", "Estimate the following set of classifiers:\n", "\n", "* 100 Decision Trees where max_depth=None\n", "* 100 Decision Trees where max_depth=2\n", "* 100 Logistic Regressions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.3\n", "\n", "Ensemble using majority voting\n", "\n", "Evaluate using the following metrics:\n", "* Accuracy\n", "* F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.4\n", "\n", 
"Estimate the probability as the percentage of models that predict positive\n", "\n", "Modify the probability threshold and select the one that maximizes the F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.5\n", "\n", "Ensemble using weighted voting based on the oob_error\n", "\n", "Evaluate using the following metrics:\n", "* Accuracy\n", "* F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.6\n", "\n", "Estimate the probability of the weighted voting\n", "\n", "Modify the probability threshold and select the one that maximizes the F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise 9.7\n", "\n", "Estimate a logistic regression using the estimated classifiers as input\n", "\n", "Modify the probability threshold and select the one that maximizes the F1-Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 1 }