{ "cells": [ { "cell_type": "markdown", "id": "d5fc9cae-a2f2-4c65-bc89-67dfe8d25381", "metadata": {}, "source": [ "# Getting even faster by reducing allocations and by multi-threading\n", "\n", "* Gen Kuroki\n", "* 2021-06-28" ] }, { "cell_type": "code", "execution_count": 1, "id": "e86caf9d-95d3-4c28-84ff-03dbbe0f939e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "v\"1.6.1\"" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "VERSION" ] }, { "cell_type": "markdown", "id": "19a7e753-1581-430b-a885-59744332315a", "metadata": {}, "source": [ "## Original version\n", "\n", "* https://twitter.com/elbersb/status/1409454534666207232\n", "* https://elbersb.com/public/posts/interaction_simulation/" ] }, { "cell_type": "code", "execution_count": 2, "id": "a8a639f2-ba1d-42e6-93d4-2b51ff273f64", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "gen_data (generic function with 1 method)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Distributions \n", "using DataFrames \n", "\n", "const std_normal = Normal(0, 1)\n", "\n", "function gen_data()\n", " ## total time periods in the the panel = 500\n", " tt = 500\n", "\n", " # x1 and x2 covariates\n", " x1_A = 1 .+ rand(std_normal, tt)\n", " x1_B = 1/4 .+ rand(std_normal, tt)\n", " x2_A = 1 .+ x1_A .+ rand(std_normal, tt)\n", " x2_B = 1 .+ x1_B .+ rand(std_normal, tt)\n", "\n", " # outcomes (notice different slope coefs for x2_A and x2_B)\n", " y_A = x1_A .+ 1*x2_A + rand(std_normal, tt)\n", " y_B = x1_B .+ 2*x2_B + rand(std_normal, tt)\n", "\n", " # combine\n", " DataFrame(\n", " id = vcat(fill(0, length(x1_A)), fill(1, length(x1_B))),\n", " x1 = vcat(x1_A, x1_B),\n", " x2 = vcat(x2_A, x2_B),\n", " x1_dmean = vcat(x1_A .- mean(x1_A), x1_B .- mean(x1_B)),\n", " x2_dmean = vcat(x2_A .- mean(x2_A), x2_B .- mean(x2_B)),\n", " y = vcat(y_A, y_B))\n", "end" ] }, { "cell_type": "code", "execution_count": 3, "id": "2a00cf51-ad83-4f88-8101-e3cb27a68e2c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(-0.14620466773983154, -0.026738115648363484)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using GLM\n", "\n", "function coefs_lm_formula(data)\n", " mod_level = lm(@formula(y ~ id + x1 * x2), data)\n", " mod_dmean = lm(@formula(y ~ id + x1_dmean * x2_dmean), data)\n", " (coef(mod_level)[end], coef(mod_dmean)[end])\n", "end\n", "\n", "# example\n", "data = gen_data()\n", "coefs_lm_formula(data)" ] }, { "cell_type": "code", "execution_count": 4, "id": "d0c50465-d648-46df-9c6a-23bc9b216756", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 8.495 s (23260002 allocations: 16.87 GiB)\n" ] } ], "source": [ "function run_simulations(nsim)\n", " sims = zeros(nsim, 2);\n", " for i in 1:nsim\n", " data = gen_data()\n", " sims[i, :] .= coefs_lm_formula(data)\n", " end\n", " sims\n", "end\n", "\n", "using BenchmarkTools\n", "n = 20000\n", "@btime run_simulations($n);" ] }, { "cell_type": "code", "execution_count": 5, "id": "e2c5ab83-330f-4db3-8176-22e588b8aca4", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Plots\n", "sims = run_simulations(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 6, "id": "d605839d-e40d-4b86-b3f4-62211c1d6ed7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 3.372 s (2800002 allocations: 8.30 GiB)\n" ] } ], "source": [ "function coefs_lm_formula(data)\n", " constant = fill(1, nrow(data))\n", " X = Float64[constant data.id data.x1 data.x2 data.x1 .* data.x2]\n", " mod_level = fit(LinearModel, X, data.y)\n", " X[:, 5] .= data.x1_dmean .* data.x2_dmean\n", " mod_dmean = fit(LinearModel, X, data.y)\n", " (coef(mod_level)[end], coef(mod_dmean)[end])\n", "end\n", "\n", "@btime run_simulations($n);" ] }, { "cell_type": "code", "execution_count": 7, "id": "fca6149a-9d76-4e94-b306-40c3ae059eeb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2.191 s (2040002 allocations: 4.65 GiB)\n" ] } ], "source": [ "using LinearAlgebra \n", "fastfit(X, y) = cholesky!(Symmetric(X' * X)) \\ (X' * y)\n", "\n", "function coefs_lm_formula(data)\n", " constant = fill(1, nrow(data))\n", " X = Float64[constant data.id data.x1 data.x2 data.x1 .* data.x2]\n", " mod_level = fastfit(X, data.y)\n", " X[:, 5] .= data.x1_dmean .* data.x2_dmean\n", " mod_dmean = fastfit(X, data.y)\n", " (mod_level[end], mod_dmean[end])\n", "end\n", "\n", "@btime run_simulations($n);" ] }, { "cell_type": "markdown", "id": "da8e5a2d-ee9f-4e1d-9d1a-04016b1bbae1", "metadata": {}, "source": [ "## Optimized version: Reducing allocations\n", "\n", "Ref. [Performance Tips: Pre-allocating outputs](https://docs.julialang.org/en/v1/manual/performance-tips/#Pre-allocating-outputs)" ] }, { "cell_type": "code", "execution_count": 8, "id": "62c9e980-50c5-45fc-b1e6-9897721f0c57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "n = 20000 = 20000\n", " 1.841701 seconds (4.02 M allocations: 212.852 MiB, 2.88% gc time, 45.74% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using DataFrames\n", "using Distributions\n", "using LinearAlgebra\n", "using Random: rand!\n", "using BenchmarkTools\n", "using Plots\n", "\n", "const std_normal = Normal(0, 1)\n", "\n", "function undef_data(tt)\n", " id = vcat(fill(0, tt), fill(1, tt))\n", " x1 = similar(id, Float64)\n", " x2 = similar(x1)\n", " x1_dmean = similar(x1)\n", " x2_dmean = similar(x2)\n", " y = similar(x1)\n", " DataFrame(id = id, x1 = x1, x2 = x2, x1_dmean = x1_dmean, x2_dmean = x2_dmean, y = y)\n", "end\n", "\n", "function gen_data!(data, tmp)\n", " tt = length(tmp) # Assume 2tt = nrow(data)\n", " A, B = 1:tt, tt+1:2tt\n", " \n", " # Warning: dataframe is type-unstable\n", " x1::Vector{Float64} = data.x1\n", " x2::Vector{Float64} = data.x2\n", " x1_dmean::Vector{Float64} = data.x1_dmean\n", " x2_dmean::Vector{Float64} = data.x2_dmean\n", " y::Vector{Float64} = data.y\n", " \n", " @views x1[A] .= 1 .+ rand!(std_normal, tmp)\n", " @views x1[B] .= 1/4 .+ rand!(std_normal, tmp)\n", " @views x2[A] .= 1 .+ x1[A] .+ rand!(std_normal, tmp)\n", " @views x2[B] .= 1 .+ x1[B] .+ rand!(std_normal, tmp)\n", "\n", " # outcomes (notice different slope coefs for x2[A] and x2[B])\n", " @views y[A] .= x1[A] .+ 1 .* x2[A] .+ rand!(std_normal, tmp)\n", " @views y[B] .= x1[B] .+ 2 .* x2[B] .+ rand!(std_normal, tmp)\n", " \n", " @views x1_dmean[A] .= x1[A] .- mean(x1[A])\n", " @views x1_dmean[B] .= x1[B] .- mean(x1[B])\n", " @views x2_dmean[A] .= x2[A] .- mean(x2[A])\n", " @views x2_dmean[B] .= x2[B] .- mean(x2[B])\n", " \n", " data\n", "end\n", "\n", "function fastfit!(mod, X, y, XtX, Xty)\n", " mul!(XtX, X', X)\n", " mul!(Xty, X', y)\n", " ldiv!(mod, cholesky!(Symmetric(XtX)), Xty)\n", "end\n", "\n", "function coefs_lm_formula!(result, data, X, mod_level, mod_dmean, XtX, Xty)\n", " # Warning: dataframe is type-unstable.\n", " id::Vector{Int} = data.id\n", " x1::Vector{Float64} = data.x1\n", " x2::Vector{Float64} = data.x2\n", " x1_dmean::Vector{Float64} = data.x1_dmean\n", " x2_dmean::Vector{Float64} = data.x2_dmean\n", " y::Vector{Float64} = data.y\n", " \n", " X[:, 1] .= 1\n", " X[:, 2] .= id\n", " X[:, 3] .= x1\n", " X[:, 4] .= x2\n", " X[:, 5] .= x1 .* x2\n", " fastfit!(mod_level, X, y, XtX, Xty)\n", "\n", " X[:, 5] .= x1_dmean .* x2_dmean\n", " fastfit!(mod_dmean, X, y, XtX, Xty)\n", " \n", " result .= (mod_level[end], mod_dmean[end])\n", "end\n", "\n", "function run_simulations_optimized(nsim, tt=500)\n", " sims = Matrix{Float64}(undef, nsim, 2)\n", " data = undef_data(tt)\n", " tmp = Vector{Float64}(undef, tt)\n", " X = Matrix{Float64}(undef, 2tt, 5)\n", " mod_level = Vector{Float64}(undef, 5)\n", " mod_dmean = similar(mod_level)\n", " XtX = Matrix{Float64}(undef, 5, 5)\n", " Xty = Vector{Float64}(undef, 5)\n", " for i in 1:nsim\n", " gen_data!(data, tmp)\n", " @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n", " end\n", " sims\n", "end\n", "\n", "@show n = 20000\n", "@time sims = run_simulations_optimized(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 9, "id": "5d44270c-3e3c-4b7d-bca0-e8dab8765fef", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimized version:\n", " 27.100 μs (0 allocations: 0 bytes)\n", " 14.700 μs (8 allocations: 288 bytes)\n", " 907.061 ms (160062 allocations: 5.94 MiB)\n" ] } ], "source": [ "tt = 500\n", "data = undef_data(tt)\n", "tmp = Vector{Float64}(undef, tt)\n", "X = Matrix{Float64}(undef, 2tt, 5)\n", "mod_level = Vector{Float64}(undef, 5)\n", "mod_dmean = similar(mod_level)\n", "XtX = Matrix{Float64}(undef, 5, 5)\n", "Xty = Vector{Float64}(undef, 5)\n", "sim = zeros(2)\n", "n = 20000\n", "\n", "println(\"Optimized version:\")\n", "@btime gen_data!($data, $tmp)\n", "@btime coefs_lm_formula!($sim, $data, $X, $mod_level, $mod_dmean, $XtX, $Xty)\n", "@btime run_simulations_optimized($n);" ] }, { "cell_type": "code", "execution_count": 10, "id": "56711e76-1862-4902-a727-e0e4d9cf220a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original version:\n", " 41.000 μs (73 allocations: 187.31 KiB)\n", " 21.100 μs (26 allocations: 56.52 KiB)\n", " 2.252 s (2040002 allocations: 4.65 GiB)\n" ] } ], "source": [ "println(\"Original version:\")\n", "@btime gen_data()\n", "@btime coefs_lm_formula($data)\n", "@btime run_simulations($n);" ] }, { "cell_type": "markdown", "id": "9661eea1-be1a-4562-b435-f9fa52a0f2e6", "metadata": {}, "source": [ "## Multi-thread original version" ] }, { "cell_type": "code", "execution_count": 11, "id": "2b34fb4e-e8f4-4fc7-a008-f2f3fdf11829", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Threads.nthreads() = 12\n", " 1.802908 seconds (2.14 M allocations: 4.659 GiB, 40.37% gc time, 1.47% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "function run_simulations_multithread(nsim)\n", " sims = zeros(nsim, 2);\n", " Threads.@threads for i in 1:nsim\n", " data = gen_data()\n", " sims[i, :] .= coefs_lm_formula(data)\n", " end\n", " sims\n", "end\n", "\n", "@show Threads.nthreads()\n", "@time sims = run_simulations_multithread(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 12, "id": "a8b60be5-3565-4288-9f71-ee184b765d52", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multi-thread original version: \n", "Threads.nthreads() = 12\n", " 1.527 s (2040068 allocations: 4.65 GiB)\n" ] } ], "source": [ "println(\"Multi-thread original version: \")\n", "@show Threads.nthreads()\n", "@btime run_simulations_multithread($n);" ] }, { "cell_type": "markdown", "id": "5b3146d8-266f-441c-97c6-e0f1d4b62fa1", "metadata": {}, "source": [ "## Multi-thread optimized version" ] }, { "cell_type": "code", "execution_count": 13, "id": "993f4daa-98c6-46f2-a8ca-8a1fe22e1f84", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Threads.nthreads() = 12\n", " 0.204476 seconds (229.28 k allocations: 11.634 MiB, 5.18% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pkg> dev https://github.com/genkuroki/MyUtils.jl\n", "using MyUtils: @my_threads\n", "\n", "function run_simulations_multithread_optimized(nsim, tt=500)\n", " sims = Matrix{Float64}(undef, nsim, 2)\n", " @my_threads begin\n", " data = undef_data(tt)\n", " tmp = Vector{Float64}(undef, tt)\n", " X = Matrix{Float64}(undef, 2tt, 5)\n", " mod_level = Vector{Float64}(undef, 5)\n", " mod_dmean = similar(mod_level)\n", " XtX = Matrix{Float64}(undef, 5, 5)\n", " Xty = Vector{Float64}(undef, 5)\n", " end for i in 1:nsim\n", " gen_data!(data, tmp)\n", " @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n", " end begin\n", " end\n", " sims\n", "end\n", "\n", "@show Threads.nthreads()\n", "@time sims = run_simulations_multithread_optimized(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 14, "id": "08864d8f-acc3-4cdb-b883-2be11e9142cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multi-thread optimized version: \n", "Threads.nthreads() = 12\n", " 169.589 ms (160786 allocations: 7.56 MiB)\n" ] } ], "source": [ "println(\"Multi-thread optimized version: \")\n", "@show Threads.nthreads()\n", "@btime run_simulations_multithread_optimized($n);" ] }, { "cell_type": "markdown", "id": "7ac1b051-a13e-4e48-a341-2d62f64b2293", "metadata": {}, "source": [ "## non-DataFrame version\n", "\n", "https://discourse.julialang.org/t/simulation-of-regression-coefficients/63683/2" ] }, { "cell_type": "code", "execution_count": 15, "id": "ee3f65e4-e49c-400b-afbb-c9130381b3f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "run_simulations5 (generic function with 1 method)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Distributions\n", "using LinearAlgebra\n", "using BenchmarkTools\n", "using Plots\n", "using Base.Threads\n", "\n", "const std_normal = Normal(0, 1)\n", "fastfit(X, y) = cholesky!(Symmetric(X' * X)) \\ (X' * y)\n", "\n", "function gen_data2!(X, y, x1_dmean, x2_dmean)\n", " ## total time periods in the the panel = 500\n", " tt = 500\n", "\n", " # x1 and x2 covariates\n", " x1_A = 1 .+ rand(std_normal, tt)\n", " x1_B = 1/4 .+ rand(std_normal, tt)\n", " x2_A = 1 .+ x1_A .+ rand(std_normal, tt)\n", " x2_B = 1 .+ x1_B .+ rand(std_normal, tt)\n", "\n", " # outcomes (notice different slope coefs for x2_A and x2_B)\n", " y_A = x1_A .+ 1*x2_A + rand(std_normal, tt)\n", " y_B = x1_B .+ 2*x2_B + rand(std_normal, tt)\n", " x1 = vcat(x1_A, x1_B)\n", " x2 = vcat(x2_A, x2_B)\n", "\n", " X[:, 2] .= vcat(fill(0, length(x1_A)), fill(1, length(x1_B)))\n", " X[:, 3] .= x1\n", " X[:, 4] .= x2\n", " X[:, 5] .= (x1 .* x2)\n", "\n", " y .= vcat(y_A, y_B)\n", "\n", " x1_dmean .= vcat(x1_A .- mean(x1_A), x1_B .- mean(x1_B))\n", " x2_dmean .= vcat(x2_A .- mean(x2_A), x2_B .- mean(x2_B))\n", "end\n", "\n", "function coefs_lm_formula4(X, y, x1_dmean, x2_dmean)\n", " mod_level = fastfit(X, y)\n", " @inbounds X[:, 5] .= x1_dmean .* x2_dmean\n", " mod_dmean = fastfit(X, y)\n", " (mod_level[end], mod_dmean[end])\n", "end\n", "\n", "function run_simulations5(nsim)\n", " sims = zeros(nsim, 2);\n", " X = [zeros(Float64, 1000, 5) for _ in 1:nthreads()]\n", " y = [zeros(Float64, 1000) for _ in 1:nthreads()]\n", " x1_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]\n", " x2_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]\n", "\n", " # set constant\n", " for i in 1:nthreads()\n", " X[i][:, 1] .= 1\n", " end\n", "\n", " @threads for i in 1:nsim\n", " gen_data2!(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])\n", " @inbounds sims[i, 1], sims[i, 2] = coefs_lm_formula4(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])\n", " end\n", " sims\n", "end" ] }, { "cell_type": "code", "execution_count": 16, "id": "e9639558-5216-45e1-b1bc-974988c56c59", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Threads.nthreads() = 12\n", " 1.447246 seconds (2.48 M allocations: 2.726 GiB, 24.63% gc time, 10.16% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n = 20000\n", "@show Threads.nthreads()\n", "@time sims = run_simulations5(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 17, "id": "2d3fc781-986d-4d90-9dad-264284d3c2b6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multi-thread non-dataframe version (run_simulations5): \n", "Threads.nthreads() = 12\n", " 664.780 ms (760131 allocations: 2.64 GiB)\n" ] } ], "source": [ "println(\"Multi-thread non-dataframe version (run_simulations5): \")\n", "@show Threads.nthreads()\n", "@btime run_simulations5($n);" ] }, { "cell_type": "markdown", "id": "e21456c5-441a-4382-aa20-67d05ba3bb08", "metadata": {}, "source": [ "## Optimized non-DataFrame version" ] }, { "cell_type": "code", "execution_count": 18, "id": "ebd9b54e-da8d-46fd-a6df-b4553a80f122", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "n = 20000 = 20000\n", " 1.514930 seconds (2.50 M allocations: 128.396 MiB, 3.07% gc time, 35.76% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Distributions\n", "using LinearAlgebra\n", "using Random: rand!\n", "using BenchmarkTools\n", "using Plots\n", "\n", "const std_normal = Normal(0, 1)\n", "\n", "# use NamedTuple indtead of DataFrame\n", "function undef_data_namedtuple(tt)\n", " id = vcat(fill(0, tt), fill(1, tt))\n", " x1 = similar(id, Float64)\n", " x2 = similar(x1)\n", " x1_dmean = similar(x1)\n", " x2_dmean = similar(x2)\n", " y = similar(x1)\n", " (; id, x1, x2, x1_dmean, x2_dmean, y)\n", "end\n", "\n", "function gen_data!(data, tmp)\n", " tt = length(tmp) # Assume 2tt = nrow(data)\n", " A, B = 1:tt, tt+1:2tt\n", " \n", " # Warning: dataframe is type-unstable\n", " x1::Vector{Float64} = data.x1\n", " x2::Vector{Float64} = data.x2\n", " x1_dmean::Vector{Float64} = data.x1_dmean\n", " x2_dmean::Vector{Float64} = data.x2_dmean\n", " y::Vector{Float64} = data.y\n", " \n", " @views x1[A] .= 1 .+ rand!(std_normal, tmp)\n", " @views x1[B] .= 1/4 .+ rand!(std_normal, tmp)\n", " @views x2[A] .= 1 .+ x1[A] .+ rand!(std_normal, tmp)\n", " @views x2[B] .= 1 .+ x1[B] .+ rand!(std_normal, tmp)\n", "\n", " # outcomes (notice different slope coefs for x2[A] and x2[B])\n", " @views y[A] .= x1[A] .+ 1 .* x2[A] .+ rand!(std_normal, tmp)\n", " @views y[B] .= x1[B] .+ 2 .* x2[B] .+ rand!(std_normal, tmp)\n", " \n", " @views x1_dmean[A] .= x1[A] .- mean(x1[A])\n", " @views x1_dmean[B] .= x1[B] .- mean(x1[B])\n", " @views x2_dmean[A] .= x2[A] .- mean(x2[A])\n", " @views x2_dmean[B] .= x2[B] .- mean(x2[B])\n", " \n", " data\n", "end\n", "\n", "function fastfit!(mod, X, y, XtX, Xty)\n", " mul!(XtX, X', X)\n", " mul!(Xty, X', y)\n", " ldiv!(mod, cholesky!(Symmetric(XtX)), Xty)\n", "end\n", "\n", "function coefs_lm_formula!(result, data, X, mod_level, mod_dmean, XtX, Xty)\n", " # Warning: dataframe is type-unstable.\n", " id::Vector{Int} = data.id\n", " x1::Vector{Float64} = data.x1\n", " x2::Vector{Float64} = data.x2\n", " x1_dmean::Vector{Float64} = data.x1_dmean\n", " x2_dmean::Vector{Float64} = data.x2_dmean\n", " y::Vector{Float64} = data.y\n", " \n", " X[:, 1] .= 1\n", " X[:, 2] .= id\n", " X[:, 3] .= x1\n", " X[:, 4] .= x2\n", " X[:, 5] .= x1 .* x2\n", " fastfit!(mod_level, X, y, XtX, Xty)\n", "\n", " X[:, 5] .= x1_dmean .* x2_dmean\n", " fastfit!(mod_dmean, X, y, XtX, Xty)\n", " \n", " result .= (mod_level[end], mod_dmean[end])\n", "end\n", "\n", "function run_simulations_optimized_nondataframe(nsim, tt=500)\n", " sims = Matrix{Float64}(undef, nsim, 2)\n", " data = undef_data_namedtuple(tt) # use NamedTuple indtead of DataFrame\n", " tmp = Vector{Float64}(undef, tt)\n", " X = Matrix{Float64}(undef, 2tt, 5)\n", " mod_level = Vector{Float64}(undef, 5)\n", " mod_dmean = similar(mod_level)\n", " XtX = Matrix{Float64}(undef, 5, 5)\n", " Xty = Vector{Float64}(undef, 5)\n", " for i in 1:nsim\n", " gen_data!(data, tmp)\n", " @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n", " end\n", " sims\n", "end\n", "\n", "@show n = 20000\n", "@time sims = run_simulations_optimized_nondataframe(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 19, "id": "769c9917-0f7e-492d-a793-f62999fc6a34", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimized non-dataframe version (single thread): \n", " 885.032 ms (160017 allocations: 5.90 MiB)\n" ] } ], "source": [ "println(\"Optimized non-dataframe version (single thread): \")\n", "@btime run_simulations_optimized_nondataframe($n);" ] }, { "cell_type": "markdown", "id": "2ee0a401-b16d-4da3-9551-7ebbcdfcd563", "metadata": {}, "source": [ "## Multi-thread optimized non-DataFrame version" ] }, { "cell_type": "code", "execution_count": 20, "id": "ce74687b-be7c-4e86-bc38-558d184e8317", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Threads.nthreads() = 12\n", " 0.211943 seconds (228.71 k allocations: 11.044 MiB, 6.34% compilation time)\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pkg> dev https://github.com/genkuroki/MyUtils.jl\n", "using MyUtils: @my_threads\n", "\n", "function run_simulations_multithread_optimized_nondataframe(nsim, tt=500)\n", " sims = Matrix{Float64}(undef, nsim, 2)\n", " @my_threads begin\n", " data = undef_data_namedtuple(tt)\n", " tmp = Vector{Float64}(undef, tt)\n", " X = Matrix{Float64}(undef, 2tt, 5)\n", " mod_level = Vector{Float64}(undef, 5)\n", " mod_dmean = similar(mod_level)\n", " XtX = Matrix{Float64}(undef, 5, 5)\n", " Xty = Vector{Float64}(undef, 5)\n", " end for i in 1:nsim\n", " gen_data!(data, tmp)\n", " @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n", " end begin\n", " end\n", " sims\n", "end\n", "\n", "@show Threads.nthreads()\n", "@time sims = run_simulations_multithread_optimized_nondataframe(n)\n", "histogram(sims, label = [\"level\" \"dmean\"])" ] }, { "cell_type": "code", "execution_count": 21, "id": "2ee53979-a586-405e-8cac-4d04d5c86760", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Multi-thread optimized non-dataframe version: \n", "Threads.nthreads() = 12\n", " 163.337 ms (160247 allocations: 6.97 MiB)\n" ] } ], "source": [ "println(\"Multi-thread optimized non-dataframe version: \")\n", "@show Threads.nthreads()\n", "@btime run_simulations_multithread_optimized_nondataframe($n);" ] }, { "cell_type": "markdown", "id": "f842c2ca-152b-4ed3-978a-f4bc54da3320", "metadata": {}, "source": [ "## Summary" ] }, { "cell_type": "code", "execution_count": 24, "id": "67e7afe6-c3a8-404e-b4a7-fa9f38397fd3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "VERSION = v\"1.6.1\"\n", "n = 20000\n", "\n", "Original version (with DataFrame, single thread):\n", " 1.836 s (2040002 allocations: 4.65 GiB)\n", "\n", "Optimized version (with DataFrame, single thread):\n", " 918.817 ms (160062 allocations: 5.94 MiB)\n", "\n", "Multi-thread optimized version (with DataFrame): \n", "Threads.nthreads() = 12\n", " 168.350 ms (160786 allocations: 7.56 MiB)\n", "\n", "Multi-thread non-dataframe version (run_simulations5): \n", "Threads.nthreads() = 12\n", " 714.319 ms (760130 allocations: 2.64 GiB)\n", "\n", "Optimized non-dataframe version (single thread): \n", " 869.904 ms (160017 allocations: 5.90 MiB)\n", "\n", "Multi-thread optimized non-dataframe version: \n", "Threads.nthreads() = 12\n", " 161.730 ms (160246 allocations: 6.97 MiB)\n", "\n" ] } ], "source": [ "n = 20000\n", "\n", "@show VERSION\n", "@show n\n", "println()\n", "\n", "println(\"Original version (with DataFrame, single thread):\")\n", "@btime run_simulations($n)\n", "println()\n", "\n", "println(\"Optimized version (with DataFrame, single thread):\")\n", "@btime run_simulations_optimized($n)\n", "println()\n", "\n", "println(\"Multi-thread optimized version (with DataFrame): \")\n", "@show Threads.nthreads()\n", "@btime run_simulations_multithread_optimized($n)\n", "println()\n", "\n", "println(\"Multi-thread non-dataframe version (run_simulations5): \")\n", "@show Threads.nthreads()\n", "@btime run_simulations5($n)\n", "println()\n", "\n", "println(\"Optimized non-dataframe version (single thread): \")\n", "@btime run_simulations_optimized_nondataframe($n)\n", "println()\n", "\n", "println(\"Multi-thread optimized non-dataframe version: \")\n", "@show Threads.nthreads()\n", "@btime run_simulations_multithread_optimized_nondataframe($n)\n", "println()" ] }, { "cell_type": "code", "execution_count": 25, "id": "a8d6c0b4-8c51-4183-827a-e78bc51ee381", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Julia Version 1.6.1\n", "Commit 6aaedecc44 (2021-04-23 05:59 UTC)\n", "Platform Info:\n", " OS: Windows (x86_64-w64-mingw32)\n", " CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz\n", " WORD_SIZE: 64\n", " LIBM: libopenlibm\n", " LLVM: libLLVM-11.0.1 (ORCJIT, skylake)\n", "Environment:\n", " JULIA_DEPOT_PATH = D:\\.julia\n", " JULIA_NUM_THREADS = 12\n", " JULIA_PYTHONCALL_EXE = D:\\.julia\\conda\\3\\python.exe\n" ] } ], "source": [ "versioninfo()" ] }, { "cell_type": "code", "execution_count": null, "id": "e930f102-ec6a-4733-af04-2c3dc7d8f9e2", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "formats": "ipynb,jl:hydrogen" }, "kernelspec": { "display_name": "Julia 1.6.1", "language": "julia", "name": "julia-1.6" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.6.1" } }, "nbformat": 4, "nbformat_minor": 5 }