{ "metadata": { "language": "Julia", "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Basic Statistics in Julia" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using Stats" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "srand(1)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "x = rand(100)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "min(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "median(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "max(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "quantile(x, [0.0, 0.5, 1.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "describe(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Probability Distributions in Julia" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using Distributions" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "x = rand(Gamma(1, 2), 100)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Standard R Functions with Simpler Names" ] }, { "cell_type": "code", "collapsed": false, "input": [ "d = Normal(0, 1)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pdf(d, 0.0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": 
"code", "collapsed": false, "input": [ "cdf(d, 0.0)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "quantile(d, 0.1)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rand(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rand(Categorical([0.1, 0.9]))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "rand(sampler(Categorical([0.5, 0.5])))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "Categorical([0.5, 0.5])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "sampler(Categorical([0.5, 0.5]))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Additional Abstractions around PDFs, CDFs, etc." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "quantile(d, [0.25, 0.75])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "-loglikelihood(d, rand(d, 100_000)) / 100_000" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Theoretical Properties of Distributions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "entropy(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mean(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "skewness(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "kurtosis(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "var(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "modes(d)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Fit Distributions to Data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x = rand(d, 1_000)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "fit_mle(Normal, x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "(mean(d), std(d)), (mean(x), std(x))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "methods(mean)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Bayesian Updating with Conjugate Priors" ] }, { "cell_type": "code", "collapsed": false, "input": [ "x = rand(Bernoulli(0.9), 
10_000)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "posterior(Beta(3, 3), Bernoulli, x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Kernel Density Estimation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using Gadfly" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "x = rand(Gamma(3, 3), 100_000)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "k = kde(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "names(Distributions.UnivariateKDE)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "set_default_plot_size(25cm, 15cm)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "plot(x = k.x, y = k.density,\n", " Guide.XLabel(\"x\"), Guide.YLabel(\"Estimated Density\"),\n", " Geom.line)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Tabular Data and Missing Values in Julia" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Representing Missing Values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using DataFrames" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "NA + 1" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "x = DataArray([1, 2, 3])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "{1, 2, NA}" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", 
"collapsed": false, "input": [ "x[1] = NA" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mean(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "x[!isna(x)]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mean(x[!isna(x)])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Factor-Like Variables" ] }, { "cell_type": "code", "collapsed": false, "input": [ "y = PooledDataArray([1, 1, 2, 3])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "levels(y)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Representing Tabular Data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame(A = float(1:10), B = rand(10))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "head(df)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "tail(df)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df[\"C\"] = repeat([\"G1\", \"G2\"], inner = [5])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "pool!(df, [\"C\"])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df[\"C\"]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "levels(df[\"C\"])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "repeat([1 2; 3 4], inner = [2, 1], 
outer = [1, 2],)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "z = DataArray([1 + 2im])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "z[1] = NA" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "DataFrame(A = [DataFrame(B = 1:2), DataFrame(C = 3:4)])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df[1:10, :]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "by(df, \"C\", df -> mean(df[\"B\"]))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "select(:(C .== \"G1\"), df)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df[:(C .== \"G1\"), :]" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df[\"C\"] .== \"G1\"" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "with(df, :(A + B))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Accessing Classical Datasets" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using RDatasets" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "iris = data(\"datasets\", \"iris\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "head(iris)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "plot(iris,\n", " x = \"Petal.Length\", y = \"Petal.Width\", color = \"Species\",\n", " Geom.point)" ], 
"language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Converting DataFrames to Design Matrices" ] }, { "cell_type": "code", "collapsed": false, "input": [ "ModelMatrix(ModelFrame(:(A ~ B), df))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "DataFrame I/O" ] }, { "cell_type": "code", "collapsed": false, "input": [ "writetable(\"df.csv\", df)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "df2 = readtable(\"df.csv\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Merging Data Sets" ] }, { "cell_type": "code", "collapsed": false, "input": [ "A = DataFrame(X = 1:3, Z = [\"A\", \"B\", \"C\"])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "B = DataFrame(Y = 4:6, Z = [\"A\", \"B\", \"B\"])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "join(A, B, on = \"Z\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "join(A, B, on = \"Z\", kind = :inner)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "join(A, B, on = \"Z\", kind = :left)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "join(A, B, on = \"Z\", kind = :right)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "join(A, B, on = \"Z\", kind = :outer)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", 
"level": 1, "metadata": {}, "source": [ "Split-Apply-Combine Operations" ] }, { "cell_type": "code", "collapsed": false, "input": [ "by(iris, \"Species\", nrow)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "by(iris, \"Species\", df -> mean(df[\"Petal.Length\"]))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "by(iris, \"Species\", :(N = size(_DF, 1)))" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "GLMs in Julia" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using GLM" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "glm(:(B ~ A), df, Binomial())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "glm(:(A ~ B), df, Poisson())" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Optimization in Julia" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using Optim" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "f(x::Vector) = (10.73 - x[1])^2 + (1134.29 - x[2])^4" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "f([0.0, 0.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "optimize(f, [0.0, 0.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "optimize(f, [0.0, 0.0], method = :l_bfgs)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Maximum Likelihood Estimation in Julia" ] }, { "cell_type": "code", 
"collapsed": false, "input": [ "x = rand(Normal(11, 3), 1_000)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "function makenll(x)\n", " nll(params::Vector) = -loglikelihood(Normal(params[1], 3), x)\n", "end" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "nll = makenll(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "nll([0.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "nll([10.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "optimize(nll, [0.0])" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "mean(x)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "More resources:\n", "\n", "* NLopt\n", "* JuMP" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "ML Algorithms" ] }, { "cell_type": "code", "collapsed": false, "input": [ "using RDatasets" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "iris = data(\"datasets\", \"iris\")" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "using Clustering" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "kmeans(matrix(iris[:, 2:5])', 3)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "by(iris, \"Species\", df -> DataFrame(A = mean(df[2]),\n", " B = mean(df[3]),\n", " C = mean(df[4]),\n", " D = mean(df[5])))" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }