{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quilt data packages with scikit-learn\n",
    "\n",
    "Two small experiments on data loaded through Quilt:\n",
    "\n",
    "1. **Iris** (`uciml/iris`): cross-validate four SVM classifiers.\n",
    "2. **Titanic** (`akarve/pydata_book`): engineer features, then cross-validate a random forest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel fails fast on a missing dependency.\n",
    "import pandas as pd\n",
    "import quilt\n",
    "from quilt.data.akarve import pydata_book as pb\n",
    "from quilt.data.uciml import iris\n",
    "from sklearn import svm\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import cross_val_score as cvs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Iris: comparing SVM classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = iris.tables.bezdek_iris()\n",
    "# Select the four leading columns by position *before* converting to an array.\n",
    "# `train.values` on the whole frame has to pick one dtype for every column,\n",
    "# including the 'class' label column (presumably strings -> object dtype).\n",
    "trainvecs = train.iloc[:, :4].values\n",
    "labels = train['class'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "C = 1.0 # magic regularization parameter\n",
    "models = (svm.SVC(kernel='linear', C=C),\n",
    "          # seeded so repeated runs give the same LinearSVC result\n",
    "          svm.LinearSVC(C=C, random_state=0),\n",
    "          svm.SVC(kernel='rbf', gamma=0.7, C=C),\n",
    "          svm.SVC(kernel='poly', degree=3, C=C))\n",
    "# fit() returns the estimator itself, so this keeps the same four objects, now fitted.\n",
    "# Use a list, not a generator expression: a generator is exhausted after one pass,\n",
    "# so re-running any later cell that iterates over `models` would see no models at all.\n",
    "models = [clf.fit(trainvecs, labels) for clf in models]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5-fold cross-validation: one array of five fold scores per model, in `models` order\n",
    "scores = [cvs(m, trainvecs, labels, cv=5) for m in models]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mean cross-validation score per model\n",
    "[a.mean() for a in scores]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pb.titanic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: from here on `train` refers to the Titanic training frame, not iris\n",
    "train = pb.titanic.train()\n",
    "test = pb.titanic.test()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check for nulls\n",
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# derived columns, computed identically for train and test\n",
    "# IsMale: 1 where Sex == 'male', else 0\n",
    "test['IsMale'] = (test['Sex'] == 'male').astype(int)\n",
    "train['IsMale'] = (train['Sex'] == 'male').astype(int)\n",
    "# NumRelatives: SibSp + Parch\n",
    "test['NumRelatives'] = test['SibSp'] + test['Parch']\n",
    "train['NumRelatives'] = train['SibSp'] + train['Parch']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# imputation\n",
    "# both statistics come from the training frame only; only the mean is used below\n",
    "age_median = train['Age'].median()\n",
    "age_mean = train['Age'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill missing ages in both frames with the *training* mean\n",
    "train['AgeImputeMean'] = train['Age'].fillna(age_mean)\n",
    "test['AgeImputeMean'] = test['Age'].fillna(age_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select four features we care about\n",
    "features = ['Pclass', 'IsMale', 'NumRelatives', 'AgeImputeMean']\n",
    "# store updated data\n",
    "# (only the feature *names* are stored, as a one-row DataFrame; the engineered\n",
    "# columns themselves stay in the in-memory `train` / `test` frames)\n",
    "pb._set(['titanic', 'features'], pd.DataFrame([features]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one-row DataFrame of feature names stored above\n",
    "features = pb.titanic.features()\n",
    "# Deliberately reuse the in-memory `train` frame from the feature-engineering section:\n",
    "# it is the one that carries IsMale / NumRelatives / AgeImputeMean. Calling\n",
    "# pb.titanic.train() again would keep those columns only if quilt returns the very\n",
    "# same cached DataFrame object - TODO confirm before relying on a reload here.\n",
    "# list(...) turns the row of names into a plain list for column selection.\n",
    "trainsub = train[list(features.values[0])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainvecs = trainsub.values\n",
    "trainlabels = train['Survived'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfc = RandomForestClassifier(max_depth=3, random_state=0)\n",
    "# 5-fold cross-validation; note this rebinds `scores` (previously the iris results)\n",
    "scores = cvs(rfc, trainvecs, trainlabels, cv=5)\n",
    "scores.mean()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}