{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# imports: keep every import in this top cell so the notebook survives Restart & Run All\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "# public import path: sklearn.linear_model.logistic is a private module (removed in 0.22)\n", "from sklearn.linear_model import LogisticRegression\n", "# sklearn.grid_search was deprecated in 0.18 and removed in 0.20; GridSearchCV lives in model_selection\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "# recall_score added: it is used in the evaluation cell below\n", "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's revise" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
0-0.330.6901100.800.88
1-0.330.9410100.810.31
2-0.330.5010001.0-10.50
3-0.330.7501101.0-10.38
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8\n", "0 -0.33 0.69 0 1 1 0 0.8 0 0.88\n", "1 -0.33 0.94 1 0 1 0 0.8 1 0.31\n", "2 -0.33 0.50 1 0 0 0 1.0 -1 0.50\n", "3 -0.33 0.75 0 1 1 0 1.0 -1 0.38" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading the data\n", "df = pd.read_csv(\"data/fertility_Diagnosis.txt\", delimiter=',', header=None)\n", "df.iloc[:4,0:9]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:9], df[9], test_size=0.1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 8 candidates, totalling 40 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=5)]: Done 31 out of 40 | elapsed: 5.8s remaining: 1.6s\n", "[Parallel(n_jobs=5)]: Done 40 out of 40 | elapsed: 5.8s finished\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=5, error_score='raise',\n", " estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False))]),\n", " fit_params={}, iid=True, n_jobs=5,\n", " param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='accuracy',\n", " verbose=True)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = Pipeline([('clf', LogisticRegression())])\n", "\n", "parameters = {\n", " 'clf__penalty': ('l1', 'l2'),\n", " 'clf__C': (0.01, 0.1, 1, 10) \n", " }\n", "\n", "grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv = 5)\n", 
"grid_search.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best score: 0.933\n", "Best parameters set:\n", "\tclf__C: 0.01\n", "\tclf__penalty: 'l1'\n" ] } ], "source": [ "print( 'Best score: %0.3f' % grid_search.best_score_)\n", "print( 'Best parameters set:')\n", "\n", "best_parameters = grid_search.best_estimator_.get_params()\n", "\n", "for param_name in sorted(parameters.keys()):\n", " print( '\\t%s: %r' % (param_name, best_parameters[param_name]))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8\n", "Precision: 0.0\n", "Recall: 0.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "# recall_score was never imported at the top of the notebook; import it here\n", "# so this cell runs on a fresh kernel without a NameError\n", "from sklearn.metrics import recall_score\n", "\n", "y_pred = grid_search.predict(X_test)\n", "\n", "# map the string labels to integers for the binary metrics: 'N' -> 2, anything else -> 1\n", "y_test = [2 if x=='N' else 1 for x in y_test]\n", "y_pred = [2 if x=='N' else 1 for x in y_pred]\n", "\n", "print( 'Accuracy:', accuracy_score(y_test, y_pred))\n", "print( 'Precision:', precision_score(y_test, y_pred))\n", "print( 'Recall:', recall_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multi-class classification \n", "\n", "The goal of multi-class classification\n", "is to assign an instance to one of the set of classes. scikit-learn uses a strategy\n", "called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. One-vs.-\n", "all classification uses one binary classifier for each of the possible classes. 
The\n", "class that is predicted with the greatest confidence is assigned to the instance." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PhraseIdSentenceIdPhraseSentiment
011A series of escapades demonstrating the adage ...1
121A series of escapades demonstrating the adage ...2
231A series2
341A2
451series2
561of escapades demonstrating the adage that what...2
671of2
781escapades demonstrating the adage that what is...2
891escapades2
9101demonstrating the adage that what is good for ...2
\n", "
" ], "text/plain": [ " PhraseId SentenceId Phrase \\\n", "0 1 1 A series of escapades demonstrating the adage ... \n", "1 2 1 A series of escapades demonstrating the adage ... \n", "2 3 1 A series \n", "3 4 1 A \n", "4 5 1 series \n", "5 6 1 of escapades demonstrating the adage that what... \n", "6 7 1 of \n", "7 8 1 escapades demonstrating the adage that what is... \n", "8 9 1 escapades \n", "9 10 1 demonstrating the adage that what is good for ... \n", "\n", " Sentiment \n", "0 1 \n", "1 2 \n", "2 2 \n", "3 2 \n", "4 2 \n", "5 2 \n", "6 2 \n", "7 2 \n", "8 2 \n", "9 2 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie = pd.read_csv(\"data/movie_train.tsv\", delimiter=\"\\t\")\n", "movie[:10]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 156060.000000\n", "mean 2.063578\n", "std 0.893832\n", "min 0.000000\n", "25% 2.000000\n", "50% 2.000000\n", "75% 3.000000\n", "max 4.000000\n", "Name: Sentiment, dtype: float64\n" ] } ], "source": [ "print(movie['Sentiment'].describe())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2 79582\n", "3 32927\n", "1 27273\n", "4 9206\n", "0 7072\n", "Name: Sentiment, dtype: int64\n" ] } ], "source": [ "print(movie['Sentiment'].value_counts())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 24 candidates, totalling 72 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 2.5min\n", "[Parallel(n_jobs=2)]: Done 72 out of 72 | elapsed: 5.7min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Best score: 0.631\n", "Best parameters set:\n", "\tclf__C: 
10\n", "\tvect__max_df: 0.25\n", "\tvect__ngram_range: (1, 2)\n", "\tvect__use_idf: False\n", "Accuracy: 0.651159810329\n", "Confusion Matrix: [[ 740 1022 287 24 2]\n", " [ 526 3854 3568 289 10]\n", " [ 120 1869 19712 2057 82]\n", " [ 10 248 3740 5096 780]\n", " [ 4 11 248 1435 1084]]\n", "Classification Report: precision recall f1-score support\n", "\n", " 0 0.53 0.36 0.43 2075\n", " 1 0.55 0.47 0.51 8247\n", " 2 0.72 0.83 0.77 23840\n", " 3 0.57 0.52 0.54 9874\n", " 4 0.55 0.39 0.46 2782\n", "\n", "avg / total 0.64 0.65 0.64 46818\n", "\n" ] } ], "source": [ "def movie_rank():\n", " \"\"\"Grid-search a TF-IDF + logistic-regression pipeline on the movie-review\n", " sentiment data and print accuracy, confusion matrix and per-class metrics.\"\"\"\n", " pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),\n", " ('clf', LogisticRegression()) \n", " ])\n", " \n", " parameters = {'vect__max_df': (0.25, 0.5),\n", " 'vect__ngram_range': ((1, 1), (1, 2)),\n", " 'vect__use_idf': (True, False),\n", " 'clf__C': (0.1, 1, 10),}\n", " \n", " movie = pd.read_csv('data/movie_train.tsv', header=0, delimiter='\\t')\n", " # .values replaces the deprecated Series.as_matrix() (removed in pandas 1.0)\n", " X, y = movie['Phrase'], movie['Sentiment'].values\n", "\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 19)\n", "\n", " grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')\n", " grid_search.fit(X_train, y_train)\n", " \n", " print( 'Best score: %0.3f' % grid_search.best_score_)\n", " print( 'Best parameters set:')\n", " best_parameters = grid_search.best_estimator_.get_params()\n", " for param_name in sorted(parameters.keys()):\n", " print( '\\t%s: %r' % (param_name, best_parameters[param_name]))\n", " \n", " predictions = grid_search.predict(X_test)\n", "\n", " print ('Accuracy:', accuracy_score(y_test, predictions))\n", " print ('Confusion Matrix:', confusion_matrix(y_test, predictions))\n", " print ('Classification Report:', classification_report(y_test, predictions))\n", "\n", "movie_rank()" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }