{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Concise example of a basic linear regression in Julia, using `GLM` and `RDatasets`. \n", "\n", "Reference:\n", "* [`GLM` documentation](https://juliastats.org/GLM.jl/stable/)\n", "* [`RDatasets` documentation](https://github.com/JuliaStats/RDatasets.jl)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "using RDatasets, GLM" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

11 rows × 5 columns

PackageDatasetTitleRowsColumns
String15String31StringInt64Int64
1ISLRAutoAuto Data Set3929
2ISLRCaravanThe Insurance Company (TIC) Benchmark582286
3ISLRCarseatsSales of Child Car Seats40011
4ISLRCollegeU.S. News and World Report's College Data77719
5ISLRDefaultCredit Card Default Data100004
6ISLRHittersBaseball Data32220
7ISLROJOrange Juice Data107018
8ISLRPortfolioPortfolio Data1002
9ISLRSmarketS&P Stock Market Data12509
10ISLRWageMid-Atlantic Wage Data300012
11ISLRWeeklyWeekly S&P Stock Market Data10899
" ], "text/latex": [ "\\begin{tabular}{r|ccccc}\n", "\t& Package & Dataset & Title & Rows & Columns\\\\\n", "\t\\hline\n", "\t& String15 & String31 & String & Int64 & Int64\\\\\n", "\t\\hline\n", "\t1 & ISLR & Auto & Auto Data Set & 392 & 9 \\\\\n", "\t2 & ISLR & Caravan & The Insurance Company (TIC) Benchmark & 5822 & 86 \\\\\n", "\t3 & ISLR & Carseats & Sales of Child Car Seats & 400 & 11 \\\\\n", "\t4 & ISLR & College & U.S. News and World Report's College Data & 777 & 19 \\\\\n", "\t5 & ISLR & Default & Credit Card Default Data & 10000 & 4 \\\\\n", "\t6 & ISLR & Hitters & Baseball Data & 322 & 20 \\\\\n", "\t7 & ISLR & OJ & Orange Juice Data & 1070 & 18 \\\\\n", "\t8 & ISLR & Portfolio & Portfolio Data & 100 & 2 \\\\\n", "\t9 & ISLR & Smarket & S\\&P Stock Market Data & 1250 & 9 \\\\\n", "\t10 & ISLR & Wage & Mid-Atlantic Wage Data & 3000 & 12 \\\\\n", "\t11 & ISLR & Weekly & Weekly S\\&P Stock Market Data & 1089 & 9 \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m11×5 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m Package \u001b[0m\u001b[1m Dataset \u001b[0m\u001b[1m Title \u001b[0m\u001b[1m Rows \u001b[0m\u001b[1m Columns \u001b[0m\n", "\u001b[1m \u001b[0m│\u001b[90m String15 \u001b[0m\u001b[90m String31 \u001b[0m\u001b[90m String \u001b[0m\u001b[90m Int64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", "─────┼────────────────────────────────────────────────────────────────────────\n", " 1 │ ISLR Auto Auto Data Set 392 9\n", " 2 │ ISLR Caravan The Insurance Company (TIC) Benc… 5822 86\n", " 3 │ ISLR Carseats Sales of Child Car Seats 400 11\n", " 4 │ ISLR College U.S. 
News and World Report's Col… 777 19\n", " 5 │ ISLR Default Credit Card Default Data 10000 4\n", " 6 │ ISLR Hitters Baseball Data 322 20\n", " 7 │ ISLR OJ Orange Juice Data 1070 18\n", " 8 │ ISLR Portfolio Portfolio Data 100 2\n", " 9 │ ISLR Smarket S&P Stock Market Data 1250 9\n", " 10 │ ISLR Wage Mid-Atlantic Wage Data 3000 12\n", " 11 │ ISLR Weekly Weekly S&P Stock Market Data 1089 9" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# List the datasets in the \"ISLR\" package of the RDatasets Julia library (there are lots more)\n", "RDatasets.datasets(\"ISLR\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

5 rows × 9 columns (omitted printing of 1 columns)

MPGCylindersDisplacementHorsepowerWeightAccelerationYearOrigin
Float64Float64Float64Float64Float64Float64Float64Float64
118.08.0307.0130.03504.012.070.01.0
215.08.0350.0165.03693.011.570.01.0
318.08.0318.0150.03436.011.070.01.0
416.08.0304.0150.03433.012.070.01.0
517.08.0302.0140.03449.010.570.01.0
" ], "text/latex": [ "\\begin{tabular}{r|ccccccccc}\n", "\t& MPG & Cylinders & Displacement & Horsepower & Weight & Acceleration & Year & Origin & \\\\\n", "\t\\hline\n", "\t& Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & Float64 & \\\\\n", "\t\\hline\n", "\t1 & 18.0 & 8.0 & 307.0 & 130.0 & 3504.0 & 12.0 & 70.0 & 1.0 & $\\dots$ \\\\\n", "\t2 & 15.0 & 8.0 & 350.0 & 165.0 & 3693.0 & 11.5 & 70.0 & 1.0 & $\\dots$ \\\\\n", "\t3 & 18.0 & 8.0 & 318.0 & 150.0 & 3436.0 & 11.0 & 70.0 & 1.0 & $\\dots$ \\\\\n", "\t4 & 16.0 & 8.0 & 304.0 & 150.0 & 3433.0 & 12.0 & 70.0 & 1.0 & $\\dots$ \\\\\n", "\t5 & 17.0 & 8.0 & 302.0 & 140.0 & 3449.0 & 10.5 & 70.0 & 1.0 & $\\dots$ \\\\\n", "\\end{tabular}\n" ], "text/plain": [ "\u001b[1m5×9 DataFrame\u001b[0m\n", "\u001b[1m Row \u001b[0m│\u001b[1m MPG \u001b[0m\u001b[1m Cylinders \u001b[0m\u001b[1m Displacement \u001b[0m\u001b[1m Horsepower \u001b[0m\u001b[1m Weight \u001b[0m\u001b[1m Acceleration \u001b[0m\u001b[1m Ye\u001b[0m ⋯\n", "\u001b[1m \u001b[0m│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Fl\u001b[0m ⋯\n", "─────┼──────────────────────────────────────────────────────────────────────────\n", " 1 │ 18.0 8.0 307.0 130.0 3504.0 12.0 ⋯\n", " 2 │ 15.0 8.0 350.0 165.0 3693.0 11.5\n", " 3 │ 18.0 8.0 318.0 150.0 3436.0 11.0\n", " 4 │ 16.0 8.0 304.0 150.0 3433.0 12.0\n", " 5 │ 17.0 8.0 302.0 140.0 3449.0 10.5 ⋯\n", "\u001b[36m 3 columns omitted\u001b[0m" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# RDatasets has a dataset() function that reads in a dataset from the catalog\n", "auto = dataset(\"ISLR\", \"Auto\");\n", "first(auto, 5)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, 
LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}\n", "\n", "MPG ~ 1 + Cylinders + Displacement\n", "\n", "Coefficients:\n", "─────────────────────────────────────────────────────────────────────────────\n", " Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%\n", "─────────────────────────────────────────────────────────────────────────────\n", "(Intercept) 36.5377 1.19661 30.53 <1e-99 34.1851 38.8903\n", "Cylinders -0.576348 0.443276 -1.30 0.1943 -1.44786 0.295169\n", "Displacement -0.0511185 0.00722576 -7.07 <1e-11 -0.0653249 -0.0369121\n", "─────────────────────────────────────────────────────────────────────────────" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the model with GLM's lm(); note that @formula takes the bare column names, with no String or Symbol wrapping\n", "ols = lm(@formula(MPG ~ Cylinders + Displacement), auto)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.7.0", "language": "julia", "name": "julia-1.7" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.7.0" } }, "nbformat": 4, "nbformat_minor": 4 }