{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A Linear Model for Bulldozers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "# `pd`, `math`, `proc_df` and `set_plot_sizes` are used in the cells below without being\n",
    "# imported by name, so they have to arrive through the two star imports.\n",
    "from fastai.imports import *\n",
    "from fastai.structured import *\n",
    "from pandas_summary import DataFrameSummary\n",
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
    "from IPython.display import display\n",
    "from sklearn import metrics\n",
    "from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_plot_sizes(12, 14, 16)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load in our data from last lesson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH = \"data/bulldozers/\"\n",
    "\n",
    "# The raw frame is read from a feather file under tmp/, relative to the working directory.\n",
    "df_raw = pd.read_feather('tmp/bulldozers-raw')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derived feature: difference between the sale year and the year of manufacture.\n",
    "df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): proc_df comes from the fastai star imports; how it applies `max_n_cat`\n",
    "# and `do_scale` is not visible in this notebook - confirm against fastai.structured.\n",
    "df, y, nas, mapper = proc_df(\n",
    "    df_raw,\n",
    "    'SalePrice',\n",
    "    max_n_cat=10,\n",
    "    do_scale=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_vals(a, n):\n",
    "    \"\"\"Cut `a` at position `n` and return the pair (a[:n], a[n:]).\"\"\"\n",
    "    head = a[:n]\n",
    "    tail = a[n:]\n",
    "    return head, tail\n",
    "\n",
    "\n",
    "# The last `n_valid` rows form the validation set; everything before them is training data.\n",
    "n_valid = 12000\n",
    "n_trn = len(df) - n_valid\n",
    "y_train, y_valid = split_vals(y, n_trn)\n",
    "raw_train, raw_valid = split_vals(df_raw, n_trn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rmse(x, y):\n",
    "    \"\"\"Return the root-mean-squared difference between `x` and `y`.\n",
    "\n",
    "    The squared differences are averaged through their `.mean()` method, so\n",
    "    `x - y` must yield an object that provides one.\n",
    "    \"\"\"\n",
    "    squared_diff = (x - y) ** 2\n",
    "    return math.sqrt(squared_diff.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear regression for Bulldozers"
   ]
  },
  {
"cell_type": "markdown", "metadata": {}, "source": [ "## Data scaling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
---|---|---|---|---|---|---|---|---|
SalesID | \n", "401125.0 | \n", "3.147483e-15 | \n", "1.000001 | \n", "-0.858580 | \n", "-0.551518 | \n", "-0.308344 | \n", "0.355322 | \n", "4.855369 | \n", "
MachineID | \n", "401125.0 | \n", "2.895953e-15 | \n", "1.000001 | \n", "-2.761738 | \n", "-0.292989 | \n", "0.139657 | \n", "0.567277 | \n", "2.876309 | \n", "
ModelID | \n", "401125.0 | \n", "-1.300773e-13 | \n", "1.000001 | \n", "-1.102854 | \n", "-0.583548 | \n", "-0.367372 | \n", "0.294819 | \n", "4.871330 | \n", "
datasource | \n", "401125.0 | \n", "-6.855920e-13 | \n", "1.000001 | \n", "-1.524823 | \n", "-0.297450 | \n", "-0.297450 | \n", "0.148868 | \n", "4.165727 | \n", "
auctioneerID | \n", "401125.0 | \n", "-1.064768e-13 | \n", "1.000001 | \n", "-0.381738 | \n", "-0.321407 | \n", "-0.261075 | \n", "-0.140412 | \n", "5.591096 | \n", "
YearMade | \n", "401125.0 | \n", "6.797770e-16 | \n", "1.000001 | \n", "-3.081446 | \n", "0.294188 | \n", "0.328458 | \n", "0.345593 | \n", "0.390145 | \n", "
MachineHoursCurrentMeter | \n", "401125.0 | \n", "8.635210e-14 | \n", "1.000001 | \n", "-0.074396 | \n", "-0.074396 | \n", "-0.074396 | \n", "-0.074396 | \n", "150.038499 | \n", "
fiModelDesc | \n", "401125.0 | \n", "1.673709e+03 | \n", "1263.331163 | \n", "1.000000 | \n", "631.000000 | \n", "1395.000000 | \n", "2292.000000 | \n", "4999.000000 | \n", "
fiBaseModel | \n", "401125.0 | \n", "5.591654e+02 | \n", "469.310266 | \n", "1.000000 | \n", "206.000000 | \n", "406.000000 | \n", "704.000000 | \n", "1950.000000 | \n", "
fiSecondaryDesc | \n", "401125.0 | \n", "3.668504e+01 | \n", "38.228243 | \n", "0.000000 | \n", "0.000000 | \n", "29.000000 | \n", "57.000000 | \n", "175.000000 | \n", "
fiModelSeries | \n", "401125.0 | \n", "9.192686e+00 | \n", "27.006859 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "122.000000 | \n", "
fiModelDescriptor | \n", "401125.0 | \n", "1.223343e+01 | \n", "29.041950 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "139.000000 | \n", "
fiProductClassDesc | \n", "401125.0 | \n", "3.226222e+01 | \n", "22.596618 | \n", "1.000000 | \n", "11.000000 | \n", "35.000000 | \n", "52.000000 | \n", "74.000000 | \n", "
state | \n", "401125.0 | \n", "2.350832e+01 | \n", "15.732898 | \n", "1.000000 | \n", "9.000000 | \n", "22.000000 | \n", "41.000000 | \n", "53.000000 | \n", "
Hydraulics | \n", "401125.0 | \n", "4.253801e+00 | \n", "4.809368 | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "12.000000 | \n", "12.000000 | \n", "
Tire_Size | \n", "401125.0 | \n", "3.059847e+00 | \n", "5.950781 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "17.000000 | \n", "
Undercarriage_Pad_Width | \n", "401125.0 | \n", "4.311895e+00 | \n", "7.741894 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "19.000000 | \n", "
Stick_Length | \n", "401125.0 | \n", "6.535564e+00 | \n", "11.923308 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "29.000000 | \n", "
saleYear | \n", "401125.0 | \n", "-2.781277e-13 | \n", "1.000001 | \n", "-2.623435 | \n", "-0.711783 | \n", "0.330937 | \n", "0.852297 | \n", "1.199870 | \n", "
saleMonth | \n", "401125.0 | \n", "2.443335e-14 | \n", "1.000001 | \n", "-1.578894 | \n", "-0.994879 | \n", "-0.118857 | \n", "0.757165 | \n", "1.633187 | \n", "
saleWeek | \n", "401125.0 | \n", "-9.469515e-15 | \n", "1.000001 | \n", "-1.702718 | \n", "-0.891252 | \n", "-0.079785 | \n", "0.866926 | \n", "1.813637 | \n", "
saleDay | \n", "401125.0 | \n", "-1.649883e-15 | \n", "1.000001 | \n", "-1.793028 | \n", "-0.843733 | \n", "-0.013101 | \n", "0.817532 | \n", "1.766827 | \n", "
saleDayofweek | \n", "401125.0 | \n", "5.134902e-14 | \n", "1.000001 | \n", "-1.850225 | \n", "-0.427508 | \n", "0.283850 | \n", "0.283850 | \n", "2.417924 | \n", "
saleDayofyear | \n", "401125.0 | \n", "3.377684e-15 | \n", "1.000001 | \n", "-1.718597 | \n", "-0.926784 | \n", "-0.115659 | \n", "0.878936 | \n", "1.786624 | \n", "
saleis_month_end | \n", "401125.0 | \n", "3.491732e-14 | \n", "1.000001 | \n", "-0.186900 | \n", "-0.186900 | \n", "-0.186900 | \n", "-0.186900 | \n", "5.350455 | \n", "
saleis_month_start | \n", "401125.0 | \n", "8.367190e-14 | \n", "1.000001 | \n", "-0.161180 | \n", "-0.161180 | \n", "-0.161180 | \n", "-0.161180 | \n", "6.204230 | \n", "
saleis_quarter_end | \n", "401125.0 | \n", "4.067403e-14 | \n", "1.000001 | \n", "-0.124679 | \n", "-0.124679 | \n", "-0.124679 | \n", "-0.124679 | \n", "8.020586 | \n", "
saleis_quarter_start | \n", "401125.0 | \n", "1.443002e-14 | \n", "1.000001 | \n", "-0.079495 | \n", "-0.079495 | \n", "-0.079495 | \n", "-0.079495 | \n", "12.579339 | \n", "
saleis_year_end | \n", "401125.0 | \n", "-1.624260e-14 | \n", "1.000001 | \n", "-0.001579 | \n", "-0.001579 | \n", "-0.001579 | \n", "-0.001579 | \n", "633.343509 | \n", "
saleis_year_start | \n", "401125.0 | \n", "0.000000e+00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
Blade_Type_Angle | \n", "401125.0 | \n", "4.143347e-03 | \n", "0.064235 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_Coal | \n", "401125.0 | \n", "2.742287e-05 | \n", "0.005237 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_Landfill | \n", "401125.0 | \n", "6.232471e-05 | \n", "0.007894 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_No | \n", "401125.0 | \n", "1.852290e-03 | \n", "0.042998 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_None or Unspecified | \n", "401125.0 | \n", "2.849735e-02 | \n", "0.166389 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_PAT | \n", "401125.0 | \n", "9.625927e-02 | \n", "0.294947 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_Semi U | \n", "401125.0 | \n", "2.148208e-02 | \n", "0.144985 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_Straight | \n", "401125.0 | \n", "3.321409e-02 | \n", "0.179195 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_U | \n", "401125.0 | \n", "4.641945e-03 | \n", "0.067974 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_VPAT | \n", "401125.0 | \n", "8.842630e-03 | \n", "0.093619 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Blade_Type_nan | \n", "401125.0 | \n", "8.009773e-01 | \n", "0.399266 | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
Travel_Controls_1 Speed | \n", "401125.0 | \n", "2.492988e-05 | \n", "0.004993 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_2 Pedal | \n", "401125.0 | \n", "2.846993e-03 | \n", "0.053281 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_Differential Steer | \n", "401125.0 | \n", "1.216329e-02 | \n", "0.109615 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_Finger Tip | \n", "401125.0 | \n", "6.541602e-03 | \n", "0.080615 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_Lever | \n", "401125.0 | \n", "2.094110e-03 | \n", "0.045714 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_None or Unspecified | \n", "401125.0 | \n", "1.743172e-01 | \n", "0.379383 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_Pedal | \n", "401125.0 | \n", "1.037083e-03 | \n", "0.032187 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Travel_Controls_nan | \n", "401125.0 | \n", "8.009748e-01 | \n", "0.399268 | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
Differential_Type_Limited Slip | \n", "401125.0 | \n", "2.817077e-03 | \n", "0.053001 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Differential_Type_Locking | \n", "401125.0 | \n", "4.985977e-06 | \n", "0.002233 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Differential_Type_No Spin | \n", "401125.0 | \n", "5.135556e-04 | \n", "0.022656 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Differential_Type_Standard | \n", "401125.0 | \n", "1.697052e-01 | \n", "0.375374 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Differential_Type_nan | \n", "401125.0 | \n", "8.269592e-01 | \n", "0.378283 | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
Steering_Controls_Command Control | \n", "401125.0 | \n", "1.338735e-03 | \n", "0.036564 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Steering_Controls_Conventional | \n", "401125.0 | \n", "1.712160e-01 | \n", "0.376698 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Steering_Controls_Four Wheel Standard | \n", "401125.0 | \n", "3.440324e-04 | \n", "0.018545 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Steering_Controls_No | \n", "401125.0 | \n", "2.492988e-06 | \n", "0.001579 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Steering_Controls_Wheel | \n", "401125.0 | \n", "3.490184e-05 | \n", "0.005908 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "
Steering_Controls_nan | \n", "401125.0 | \n", "8.270639e-01 | \n", "0.378193 | \n", "0.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
192 rows × 8 columns
\n", "