{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# import\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model.logistic import LogisticRegression\n",
"from sklearn.grid_search import GridSearchCV\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lets revise"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.33 | \n",
" 0.69 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0.8 | \n",
" 0 | \n",
" 0.88 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.33 | \n",
" 0.94 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0.8 | \n",
" 1 | \n",
" 0.31 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.33 | \n",
" 0.50 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1.0 | \n",
" -1 | \n",
" 0.50 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.33 | \n",
" 0.75 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1.0 | \n",
" -1 | \n",
" 0.38 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8\n",
"0 -0.33 0.69 0 1 1 0 0.8 0 0.88\n",
"1 -0.33 0.94 1 0 1 0 0.8 1 0.31\n",
"2 -0.33 0.50 1 0 0 0 1.0 -1 0.50\n",
"3 -0.33 0.75 0 1 1 0 1.0 -1 0.38"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# reading the data\n",
"df = pd.read_csv(\"data/fertility_Diagnosis.txt\", delimiter=',', header=None)\n",
"df.iloc[:4,0:9]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:9], df[9], test_size=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 8 candidates, totalling 40 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=5)]: Done 31 out of 40 | elapsed: 5.8s remaining: 1.6s\n",
"[Parallel(n_jobs=5)]: Done 40 out of 40 | elapsed: 5.8s finished\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=5, error_score='raise',\n",
" estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
" penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
" verbose=0, warm_start=False))]),\n",
" fit_params={}, iid=True, n_jobs=5,\n",
" param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')},\n",
" pre_dispatch='2*n_jobs', refit=True, scoring='accuracy',\n",
" verbose=True)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline = Pipeline([('clf', LogisticRegression())])\n",
"\n",
"parameters = {\n",
" 'clf__penalty': ('l1', 'l2'),\n",
" 'clf__C': (0.01, 0.1, 1, 10) \n",
" }\n",
"\n",
"grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv = 5)\n",
"grid_search.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best score: 0.933\n",
"Best parameters set:\n",
"\tclf__C: 0.01\n",
"\tclf__penalty: 'l1'\n"
]
}
],
"source": [
"# report the best cross-validation score and the winning hyper-parameters\n",
"print('Best score: %0.3f' % grid_search.best_score_)\n",
"print('Best parameters set:')\n",
"\n",
"best_parameters = grid_search.best_estimator_.get_params()\n",
"for param_name in sorted(parameters):\n",
"    print('\\t%s: %r' % (param_name, best_parameters[param_name]))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.8\n",
"Precision: 0.0\n",
"Recall: 0.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"y_pred = grid_search.predict(X_test)\n",
"\n",
"#print((y_pred), (y_test))\n",
"\n",
"y_test = [2 if x=='N' else 1 for x in y_test]\n",
"y_pred = [2 if x=='N' else 1 for x in y_pred]\n",
"\n",
"#print((y_pred), (y_test))\n",
"\n",
"print( 'Accuracy:', accuracy_score(y_test, y_pred))\n",
"print( 'Precision:', precision_score(y_test, y_pred))\n",
"print( 'Recall:', recall_score(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Multi-class classification \n",
"\n",
"The goal of multi-class classification\n",
"is to assign an instance to one of the set of classes. scikit-learn uses a strategy\n",
"called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. Onevs.-\n",
"all classification uses one binary classifier for each of the possible classes. The\n",
"class that is predicted with the greatest confidence is assigned to the instance."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PhraseId | \n",
" SentenceId | \n",
" Phrase | \n",
" Sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" A series of escapades demonstrating the adage ... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" A series of escapades demonstrating the adage ... | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" A series | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" A | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1 | \n",
" series | \n",
" 2 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 1 | \n",
" of escapades demonstrating the adage that what... | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 7 | \n",
" 1 | \n",
" of | \n",
" 2 | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 1 | \n",
" escapades demonstrating the adage that what is... | \n",
" 2 | \n",
"
\n",
" \n",
" 8 | \n",
" 9 | \n",
" 1 | \n",
" escapades | \n",
" 2 | \n",
"
\n",
" \n",
" 9 | \n",
" 10 | \n",
" 1 | \n",
" demonstrating the adage that what is good for ... | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PhraseId SentenceId Phrase \\\n",
"0 1 1 A series of escapades demonstrating the adage ... \n",
"1 2 1 A series of escapades demonstrating the adage ... \n",
"2 3 1 A series \n",
"3 4 1 A \n",
"4 5 1 series \n",
"5 6 1 of escapades demonstrating the adage that what... \n",
"6 7 1 of \n",
"7 8 1 escapades demonstrating the adage that what is... \n",
"8 9 1 escapades \n",
"9 10 1 demonstrating the adage that what is good for ... \n",
"\n",
" Sentiment \n",
"0 1 \n",
"1 2 \n",
"2 2 \n",
"3 2 \n",
"4 2 \n",
"5 2 \n",
"6 2 \n",
"7 2 \n",
"8 2 \n",
"9 2 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movie = pd.read_csv(\"data/movie_train.tsv\", delimiter=\"\\t\")\n",
"movie[:10]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 156060.000000\n",
"mean 2.063578\n",
"std 0.893832\n",
"min 0.000000\n",
"25% 2.000000\n",
"50% 2.000000\n",
"75% 3.000000\n",
"max 4.000000\n",
"Name: Sentiment, dtype: float64\n"
]
}
],
"source": [
"print(movie['Sentiment'].describe())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 79582\n",
"3 32927\n",
"1 27273\n",
"4 9206\n",
"0 7072\n",
"Name: Sentiment, dtype: int64\n"
]
}
],
"source": [
"print(movie['Sentiment'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 24 candidates, totalling 72 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 2.5min\n",
"[Parallel(n_jobs=2)]: Done 72 out of 72 | elapsed: 5.7min finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best score: 0.631\n",
"Best parameters set:\n",
"\tclf__C: 10\n",
"\tvect__max_df: 0.25\n",
"\tvect__ngram_range: (1, 2)\n",
"\tvect__use_idf: False\n",
"Accuracy: 0.651159810329\n",
"Confusion Matrix: [[ 740 1022 287 24 2]\n",
" [ 526 3854 3568 289 10]\n",
" [ 120 1869 19712 2057 82]\n",
" [ 10 248 3740 5096 780]\n",
" [ 4 11 248 1435 1084]]\n",
"Classification Report: precision recall f1-score support\n",
"\n",
" 0 0.53 0.36 0.43 2075\n",
" 1 0.55 0.47 0.51 8247\n",
" 2 0.72 0.83 0.77 23840\n",
" 3 0.57 0.52 0.54 9874\n",
" 4 0.55 0.39 0.46 2782\n",
"\n",
"avg / total 0.64 0.65 0.64 46818\n",
"\n"
]
}
],
"source": [
"def movie_rank():\n",
" \n",
" pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),\n",
" ('clf', LogisticRegression()) \n",
" ])\n",
" \n",
" parameters = {'vect__max_df': (0.25, 0.5),\n",
" 'vect__ngram_range': ((1, 1), (1, 2)),\n",
" 'vect__use_idf': (True, False),\n",
" 'clf__C': (0.1, 1, 10),}\n",
" \n",
" movie=pd.read_csv('data/movie_train.tsv', header=0, delimiter='\\t')\n",
" X, y = movie['Phrase'], movie['Sentiment'].as_matrix()\n",
" #print(X[:3])\n",
" #print(y[:3])\n",
"\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 19)\n",
" #print(X_train[:3])\n",
" #print(y_train[:3])\n",
"\n",
" grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')\n",
" grid_search.fit(X_train, y_train)\n",
" \n",
" \n",
" print( 'Best score: %0.3f' % grid_search.best_score_)\n",
" print( 'Best parameters set:')\n",
" best_parameters = grid_search.best_estimator_.get_params()\n",
" for param_name in sorted(parameters.keys()):\n",
" print( '\\t%s: %r' % (param_name, best_parameters[param_name]))\n",
" \n",
" predictions = grid_search.predict(X_test)\n",
"\n",
" print ('Accuracy:', accuracy_score(y_test, predictions))\n",
" print ('Confusion Matrix:', confusion_matrix(y_test, predictions))\n",
" print ('Classification Report:', classification_report(y_test, predictions))\n",
"\n",
"movie_rank()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}