{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer\n", "from sklearn.metrics import f1_score\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.svm import LinearSVC\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.datasets import fetch_20newsgroups\n", "\n", "cats = ['alt.atheism', 'sci.space']\n", "newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)\n", "newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "X_train = newsgroups_train.data\n", "X_test = newsgroups_test.data\n", "y_train = newsgroups_train.target\n", "y_test = newsgroups_test.target" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=Pipeline(memory=None,\n", " steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", " strip...ax_iter=1000,\n", " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", " verbose=0))]),\n", " fit_params=None, iid=True, n_jobs=1,\n", " param_grid=[{'vect__max_df': [0.8, 0.9, 1.0], 'clf__dual': [True, False], 'clf__penalty': ['l2']}, {'vect__max_df': [0.8, 0.9, 1.0], 'clf__dual': [False], 'clf__penalty': ['l1']}],\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", " scoring='f1_micro', verbose=0)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = Pipeline([\n", " ('vect',CountVectorizer()),\n", " ('tfidf',TfidfTransformer()),\n", " ('clf',LinearSVC())\n", "])\n", "\n", "# this is where you define the values for\n", "# GridSearchCV to iterate over\n", "\n", "# l1 penalty is incompatible with other configs\n", "param_grid = [\n", " {\n", " 'vect__max_df':[0.8,0.9,1.0],\n", " 'clf__penalty':['l2'],\n", " 'clf__dual':[True,False]\n", " },\n", " {\n", " 'vect__max_df':[0.8,0.9,1.0],\n", " 'clf__penalty':['l1'],\n", " 'clf__dual': [False]\n", " }\n", "]\n", "\n", "# do 3-fold cross validation for each of the 6 possible\n", "# combinations of the parameter values above\n", "grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid,scoring='f1_micro')\n", "grid.fit(X_train,y_train)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best: 0.992544 using {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}\n", "0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': True, 'clf__penalty': 'l2'}\n", "0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}\n", "0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': True, 'clf__penalty': 'l2'}\n", "0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l2'}\n", "0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': False, 'clf__penalty': 'l2'}\n", "0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': False, 'clf__penalty': 'l2'}\n", "0.971109 (0.003459) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l1'}\n", "0.972041 (0.008199) with: {'vect__max_df': 0.9, 'clf__dual': False, 
# In[4]:

# summarize results
print("Best: %f using %s" % (grid.best_score_,
                             grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


# Out[4]:
# Best: 0.992544 using {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}
# 0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': True, 'clf__penalty': 'l2'}
# 0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': True, 'clf__penalty': 'l2'}
# 0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': True, 'clf__penalty': 'l2'}
# 0.990680 (0.001312) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l2'}
# 0.992544 (0.001309) with: {'vect__max_df': 0.9, 'clf__dual': False, 'clf__penalty': 'l2'}
# 0.990680 (0.001312) with: {'vect__max_df': 1.0, 'clf__dual': False, 'clf__penalty': 'l2'}
# 0.971109 (0.003459) with: {'vect__max_df': 0.8, 'clf__dual': False, 'clf__penalty': 'l1'}
# 0.972041 (0.008199) with: {'vect__max_df': 0.9, 'clf__dual': False, 'clf__penalty': 'l1'}
# 0.971109 (0.004717) with: {'vect__max_df': 1.0, 'clf__dual': False, 'clf__penalty': 'l1'}


# In[5]:

# now train on the full training set and predict test instances
# using the best configuration found above
pipeline.set_params(clf__penalty='l2', vect__max_df=0.9, clf__dual=True)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)


# In[6]:

# calculate micro-averaged f1 on the test set
f1_score(y_test, y_preds, average='micro')


# Out[6]:
# 0.97615708274894808
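# In[ ]:

# Equivalent shortcut, a minimal sketch relying on GridSearchCV's refit=True
# setting (visible in the repr above): after fit(), the grid object already
# holds a copy of the pipeline refit on the full training set with the best
# parameters, so the manual set_params/fit step can be replaced by predicting
# with the grid directly. The score should be the same (or very close, since
# LinearSVC's random_state is unset) as the value reported above.
y_preds_refit = grid.predict(X_test)
f1_score(y_test, y_preds_refit, average='micro')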