{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "data = pd.read_csv(\"train.csv\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "len(data)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data.columns" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data.Insult.value_counts()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "y_train = np.array(data.Insult)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "y_train" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "text_train = data.Comment.tolist()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data_test = pd.read_csv(\"test_with_solutions.csv\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "data_test" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import CountVectorizer" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "cv = CountVectorizer()\n", "cv.fit(text_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "len(cv.vocabulary_)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": true, "input": [ "cv.vocabulary_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_train = cv.transform(text_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_train.shape" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "text_train[6]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_train[6, :].nonzero()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_train[6]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "X_test = cv.transform(text_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.svm import LinearSVC\n", "svm = LinearSVC(C=.01)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "svm.fit(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "svm.score(X_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "svm.score(X_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "y_test_pred = svm.predict(X_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.metrics import classification_report" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(classification_report(y_test, y_test_pred))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "coef = svm.coef_.ravel()\n", "positive_coefficients = np.argsort(coef)[-25:]\n", "negative_coefficients = np.argsort(coef)[:25]\n", "interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(15, 5))\n", "plt.bar(np.arange(50), coef[interesting_coefficients], color=[\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]])\n", "feature_names = np.array(cv.get_feature_names())\n", "plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.pipeline import Pipeline" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.fit(text_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.score(text_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pipeline.score(text_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.grid_search import GridSearchCV" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}\n", "grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.fit(text_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_score_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_params_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "param_grid = {'classifier__C': 10. ** np.arange(-3, 3), \"vectorizer__ngram_range\": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)]}\n", "grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.fit(text_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_params_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "grid_search.best_score_" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tasks\n", "======\n", "1. Remove the above visualization code for the coefficients and try to recreate it.\n", "2. Can you think of any other useful features for this task?" ] } ], "metadata": {} } ] }