{
"cells": [
{
"cell_type": "markdown",
"id": "d5fc9cae-a2f2-4c65-bc89-67dfe8d25381",
"metadata": {},
"source": [
"# Getting even faster by reducing allocations and by multi-threading\n",
"\n",
"* Gen Kuroki\n",
"* 2021-06-28"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e86caf9d-95d3-4c28-84ff-03dbbe0f939e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"v\"1.6.1\""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"VERSION"
]
},
{
"cell_type": "markdown",
"id": "19a7e753-1581-430b-a885-59744332315a",
"metadata": {},
"source": [
"## Original version\n",
"\n",
"* https://twitter.com/elbersb/status/1409454534666207232\n",
"* https://elbersb.com/public/posts/interaction_simulation/"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a8a639f2-ba1d-42e6-93d4-2b51ff273f64",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"gen_data (generic function with 1 method)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using Distributions \n",
"using DataFrames \n",
"\n",
"const std_normal = Normal(0, 1)\n",
"\n",
"function gen_data()\n",
" ## total time periods in the the panel = 500\n",
" tt = 500\n",
"\n",
" # x1 and x2 covariates\n",
" x1_A = 1 .+ rand(std_normal, tt)\n",
" x1_B = 1/4 .+ rand(std_normal, tt)\n",
" x2_A = 1 .+ x1_A .+ rand(std_normal, tt)\n",
" x2_B = 1 .+ x1_B .+ rand(std_normal, tt)\n",
"\n",
" # outcomes (notice different slope coefs for x2_A and x2_B)\n",
" y_A = x1_A .+ 1*x2_A + rand(std_normal, tt)\n",
" y_B = x1_B .+ 2*x2_B + rand(std_normal, tt)\n",
"\n",
" # combine\n",
" DataFrame(\n",
" id = vcat(fill(0, length(x1_A)), fill(1, length(x1_B))),\n",
" x1 = vcat(x1_A, x1_B),\n",
" x2 = vcat(x2_A, x2_B),\n",
" x1_dmean = vcat(x1_A .- mean(x1_A), x1_B .- mean(x1_B)),\n",
" x2_dmean = vcat(x2_A .- mean(x2_A), x2_B .- mean(x2_B)),\n",
" y = vcat(y_A, y_B))\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2a00cf51-ad83-4f88-8101-e3cb27a68e2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(-0.14620466773983154, -0.026738115648363484)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using GLM\n",
"\n",
"function coefs_lm_formula(data)\n",
" mod_level = lm(@formula(y ~ id + x1 * x2), data)\n",
" mod_dmean = lm(@formula(y ~ id + x1_dmean * x2_dmean), data)\n",
" (coef(mod_level)[end], coef(mod_dmean)[end])\n",
"end\n",
"\n",
"# example\n",
"data = gen_data()\n",
"coefs_lm_formula(data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d0c50465-d648-46df-9c6a-23bc9b216756",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 8.495 s (23260002 allocations: 16.87 GiB)\n"
]
}
],
"source": [
"function run_simulations(nsim)\n",
" sims = zeros(nsim, 2);\n",
" for i in 1:nsim\n",
" data = gen_data()\n",
" sims[i, :] .= coefs_lm_formula(data)\n",
" end\n",
" sims\n",
"end\n",
"\n",
"using BenchmarkTools\n",
"n = 20000\n",
"@btime run_simulations($n);"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2c5ab83-330f-4db3-8176-22e588b8aca4",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using Plots\n",
"sims = run_simulations(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d605839d-e40d-4b86-b3f4-62211c1d6ed7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 3.372 s (2800002 allocations: 8.30 GiB)\n"
]
}
],
"source": [
"function coefs_lm_formula(data)\n",
" constant = fill(1, nrow(data))\n",
" X = Float64[constant data.id data.x1 data.x2 data.x1 .* data.x2]\n",
" mod_level = fit(LinearModel, X, data.y)\n",
" X[:, 5] .= data.x1_dmean .* data.x2_dmean\n",
" mod_dmean = fit(LinearModel, X, data.y)\n",
" (coef(mod_level)[end], coef(mod_dmean)[end])\n",
"end\n",
"\n",
"@btime run_simulations($n);"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fca6149a-9d76-4e94-b306-40c3ae059eeb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2.191 s (2040002 allocations: 4.65 GiB)\n"
]
}
],
"source": [
"using LinearAlgebra \n",
"fastfit(X, y) = cholesky!(Symmetric(X' * X)) \\ (X' * y)\n",
"\n",
"function coefs_lm_formula(data)\n",
" constant = fill(1, nrow(data))\n",
" X = Float64[constant data.id data.x1 data.x2 data.x1 .* data.x2]\n",
" mod_level = fastfit(X, data.y)\n",
" X[:, 5] .= data.x1_dmean .* data.x2_dmean\n",
" mod_dmean = fastfit(X, data.y)\n",
" (mod_level[end], mod_dmean[end])\n",
"end\n",
"\n",
"@btime run_simulations($n);"
]
},
{
"cell_type": "markdown",
"id": "da8e5a2d-ee9f-4e1d-9d1a-04016b1bbae1",
"metadata": {},
"source": [
"## Optimized version: Reducing allocations\n",
"\n",
"Ref. [Performance Tips: Pre-allocating outputs](https://docs.julialang.org/en/v1/manual/performance-tips/#Pre-allocating-outputs)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "62c9e980-50c5-45fc-b1e6-9897721f0c57",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n = 20000 = 20000\n",
" 1.841701 seconds (4.02 M allocations: 212.852 MiB, 2.88% gc time, 45.74% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using DataFrames\n",
"using Distributions\n",
"using LinearAlgebra\n",
"using Random: rand!\n",
"using BenchmarkTools\n",
"using Plots\n",
"\n",
"const std_normal = Normal(0, 1)\n",
"\n",
"function undef_data(tt)\n",
" id = vcat(fill(0, tt), fill(1, tt))\n",
" x1 = similar(id, Float64)\n",
" x2 = similar(x1)\n",
" x1_dmean = similar(x1)\n",
" x2_dmean = similar(x2)\n",
" y = similar(x1)\n",
" DataFrame(id = id, x1 = x1, x2 = x2, x1_dmean = x1_dmean, x2_dmean = x2_dmean, y = y)\n",
"end\n",
"\n",
"function gen_data!(data, tmp)\n",
" tt = length(tmp) # Assume 2tt = nrow(data)\n",
" A, B = 1:tt, tt+1:2tt\n",
" \n",
" # Warning: dataframe is type-unstable\n",
" x1::Vector{Float64} = data.x1\n",
" x2::Vector{Float64} = data.x2\n",
" x1_dmean::Vector{Float64} = data.x1_dmean\n",
" x2_dmean::Vector{Float64} = data.x2_dmean\n",
" y::Vector{Float64} = data.y\n",
" \n",
" @views x1[A] .= 1 .+ rand!(std_normal, tmp)\n",
" @views x1[B] .= 1/4 .+ rand!(std_normal, tmp)\n",
" @views x2[A] .= 1 .+ x1[A] .+ rand!(std_normal, tmp)\n",
" @views x2[B] .= 1 .+ x1[B] .+ rand!(std_normal, tmp)\n",
"\n",
" # outcomes (notice different slope coefs for x2[A] and x2[B])\n",
" @views y[A] .= x1[A] .+ 1 .* x2[A] .+ rand!(std_normal, tmp)\n",
" @views y[B] .= x1[B] .+ 2 .* x2[B] .+ rand!(std_normal, tmp)\n",
" \n",
" @views x1_dmean[A] .= x1[A] .- mean(x1[A])\n",
" @views x1_dmean[B] .= x1[B] .- mean(x1[B])\n",
" @views x2_dmean[A] .= x2[A] .- mean(x2[A])\n",
" @views x2_dmean[B] .= x2[B] .- mean(x2[B])\n",
" \n",
" data\n",
"end\n",
"\n",
"function fastfit!(mod, X, y, XtX, Xty)\n",
" mul!(XtX, X', X)\n",
" mul!(Xty, X', y)\n",
" ldiv!(mod, cholesky!(Symmetric(XtX)), Xty)\n",
"end\n",
"\n",
"function coefs_lm_formula!(result, data, X, mod_level, mod_dmean, XtX, Xty)\n",
" # Warning: dataframe is type-unstable.\n",
" id::Vector{Int} = data.id\n",
" x1::Vector{Float64} = data.x1\n",
" x2::Vector{Float64} = data.x2\n",
" x1_dmean::Vector{Float64} = data.x1_dmean\n",
" x2_dmean::Vector{Float64} = data.x2_dmean\n",
" y::Vector{Float64} = data.y\n",
" \n",
" X[:, 1] .= 1\n",
" X[:, 2] .= id\n",
" X[:, 3] .= x1\n",
" X[:, 4] .= x2\n",
" X[:, 5] .= x1 .* x2\n",
" fastfit!(mod_level, X, y, XtX, Xty)\n",
"\n",
" X[:, 5] .= x1_dmean .* x2_dmean\n",
" fastfit!(mod_dmean, X, y, XtX, Xty)\n",
" \n",
" result .= (mod_level[end], mod_dmean[end])\n",
"end\n",
"\n",
"function run_simulations_optimized(nsim, tt=500)\n",
" sims = Matrix{Float64}(undef, nsim, 2)\n",
" data = undef_data(tt)\n",
" tmp = Vector{Float64}(undef, tt)\n",
" X = Matrix{Float64}(undef, 2tt, 5)\n",
" mod_level = Vector{Float64}(undef, 5)\n",
" mod_dmean = similar(mod_level)\n",
" XtX = Matrix{Float64}(undef, 5, 5)\n",
" Xty = Vector{Float64}(undef, 5)\n",
" for i in 1:nsim\n",
" gen_data!(data, tmp)\n",
" @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n",
" end\n",
" sims\n",
"end\n",
"\n",
"@show n = 20000\n",
"@time sims = run_simulations_optimized(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5d44270c-3e3c-4b7d-bca0-e8dab8765fef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimized version:\n",
" 27.100 μs (0 allocations: 0 bytes)\n",
" 14.700 μs (8 allocations: 288 bytes)\n",
" 907.061 ms (160062 allocations: 5.94 MiB)\n"
]
}
],
"source": [
"tt = 500\n",
"data = undef_data(tt)\n",
"tmp = Vector{Float64}(undef, tt)\n",
"X = Matrix{Float64}(undef, 2tt, 5)\n",
"mod_level = Vector{Float64}(undef, 5)\n",
"mod_dmean = similar(mod_level)\n",
"XtX = Matrix{Float64}(undef, 5, 5)\n",
"Xty = Vector{Float64}(undef, 5)\n",
"sim = zeros(2)\n",
"n = 20000\n",
"\n",
"println(\"Optimized version:\")\n",
"@btime gen_data!($data, $tmp)\n",
"@btime coefs_lm_formula!($sim, $data, $X, $mod_level, $mod_dmean, $XtX, $Xty)\n",
"@btime run_simulations_optimized($n);"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "56711e76-1862-4902-a727-e0e4d9cf220a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original version:\n",
" 41.000 μs (73 allocations: 187.31 KiB)\n",
" 21.100 μs (26 allocations: 56.52 KiB)\n",
" 2.252 s (2040002 allocations: 4.65 GiB)\n"
]
}
],
"source": [
"println(\"Original version:\")\n",
"@btime gen_data()\n",
"@btime coefs_lm_formula($data)\n",
"@btime run_simulations($n);"
]
},
{
"cell_type": "markdown",
"id": "9661eea1-be1a-4562-b435-f9fa52a0f2e6",
"metadata": {},
"source": [
"## Multi-thread original version"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2b34fb4e-e8f4-4fc7-a008-f2f3fdf11829",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Threads.nthreads() = 12\n",
" 1.802908 seconds (2.14 M allocations: 4.659 GiB, 40.37% gc time, 1.47% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function run_simulations_multithread(nsim)\n",
" sims = zeros(nsim, 2);\n",
" Threads.@threads for i in 1:nsim\n",
" data = gen_data()\n",
" sims[i, :] .= coefs_lm_formula(data)\n",
" end\n",
" sims\n",
"end\n",
"\n",
"@show Threads.nthreads()\n",
"@time sims = run_simulations_multithread(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a8b60be5-3565-4288-9f71-ee184b765d52",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Multi-thread original version: \n",
"Threads.nthreads() = 12\n",
" 1.527 s (2040068 allocations: 4.65 GiB)\n"
]
}
],
"source": [
"println(\"Multi-thread original version: \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations_multithread($n);"
]
},
{
"cell_type": "markdown",
"id": "5b3146d8-266f-441c-97c6-e0f1d4b62fa1",
"metadata": {},
"source": [
"## Multi-thread optimized version"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "993f4daa-98c6-46f2-a8ca-8a1fe22e1f84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Threads.nthreads() = 12\n",
" 0.204476 seconds (229.28 k allocations: 11.634 MiB, 5.18% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pkg> dev https://github.com/genkuroki/MyUtils.jl\n",
"using MyUtils: @my_threads\n",
"\n",
"function run_simulations_multithread_optimized(nsim, tt=500)\n",
" sims = Matrix{Float64}(undef, nsim, 2)\n",
" @my_threads begin\n",
" data = undef_data(tt)\n",
" tmp = Vector{Float64}(undef, tt)\n",
" X = Matrix{Float64}(undef, 2tt, 5)\n",
" mod_level = Vector{Float64}(undef, 5)\n",
" mod_dmean = similar(mod_level)\n",
" XtX = Matrix{Float64}(undef, 5, 5)\n",
" Xty = Vector{Float64}(undef, 5)\n",
" end for i in 1:nsim\n",
" gen_data!(data, tmp)\n",
" @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n",
" end begin\n",
" end\n",
" sims\n",
"end\n",
"\n",
"@show Threads.nthreads()\n",
"@time sims = run_simulations_multithread_optimized(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "08864d8f-acc3-4cdb-b883-2be11e9142cf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Multi-thread optimized version: \n",
"Threads.nthreads() = 12\n",
" 169.589 ms (160786 allocations: 7.56 MiB)\n"
]
}
],
"source": [
"println(\"Multi-thread optimized version: \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations_multithread_optimized($n);"
]
},
{
"cell_type": "markdown",
"id": "7ac1b051-a13e-4e48-a341-2d62f64b2293",
"metadata": {},
"source": [
"## non-DataFrame version\n",
"\n",
"https://discourse.julialang.org/t/simulation-of-regression-coefficients/63683/2"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ee3f65e4-e49c-400b-afbb-c9130381b3f2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"run_simulations5 (generic function with 1 method)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using Distributions\n",
"using LinearAlgebra\n",
"using BenchmarkTools\n",
"using Plots\n",
"using Base.Threads\n",
"\n",
"const std_normal = Normal(0, 1)\n",
"fastfit(X, y) = cholesky!(Symmetric(X' * X)) \\ (X' * y)\n",
"\n",
"function gen_data2!(X, y, x1_dmean, x2_dmean)\n",
" ## total time periods in the the panel = 500\n",
" tt = 500\n",
"\n",
" # x1 and x2 covariates\n",
" x1_A = 1 .+ rand(std_normal, tt)\n",
" x1_B = 1/4 .+ rand(std_normal, tt)\n",
" x2_A = 1 .+ x1_A .+ rand(std_normal, tt)\n",
" x2_B = 1 .+ x1_B .+ rand(std_normal, tt)\n",
"\n",
" # outcomes (notice different slope coefs for x2_A and x2_B)\n",
" y_A = x1_A .+ 1*x2_A + rand(std_normal, tt)\n",
" y_B = x1_B .+ 2*x2_B + rand(std_normal, tt)\n",
" x1 = vcat(x1_A, x1_B)\n",
" x2 = vcat(x2_A, x2_B)\n",
"\n",
" X[:, 2] .= vcat(fill(0, length(x1_A)), fill(1, length(x1_B)))\n",
" X[:, 3] .= x1\n",
" X[:, 4] .= x2\n",
" X[:, 5] .= (x1 .* x2)\n",
"\n",
" y .= vcat(y_A, y_B)\n",
"\n",
" x1_dmean .= vcat(x1_A .- mean(x1_A), x1_B .- mean(x1_B))\n",
" x2_dmean .= vcat(x2_A .- mean(x2_A), x2_B .- mean(x2_B))\n",
"end\n",
"\n",
"function coefs_lm_formula4(X, y, x1_dmean, x2_dmean)\n",
" mod_level = fastfit(X, y)\n",
" @inbounds X[:, 5] .= x1_dmean .* x2_dmean\n",
" mod_dmean = fastfit(X, y)\n",
" (mod_level[end], mod_dmean[end])\n",
"end\n",
"\n",
"function run_simulations5(nsim)\n",
" sims = zeros(nsim, 2);\n",
" X = [zeros(Float64, 1000, 5) for _ in 1:nthreads()]\n",
" y = [zeros(Float64, 1000) for _ in 1:nthreads()]\n",
" x1_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]\n",
" x2_dmean = [zeros(Float64, 1000) for _ in 1:nthreads()]\n",
"\n",
" # set constant\n",
" for i in 1:nthreads()\n",
" X[i][:, 1] .= 1\n",
" end\n",
"\n",
" @threads for i in 1:nsim\n",
" gen_data2!(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])\n",
" @inbounds sims[i, 1], sims[i, 2] = coefs_lm_formula4(X[threadid()], y[threadid()], x1_dmean[threadid()], x2_dmean[threadid()])\n",
" end\n",
" sims\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e9639558-5216-45e1-b1bc-974988c56c59",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Threads.nthreads() = 12\n",
" 1.447246 seconds (2.48 M allocations: 2.726 GiB, 24.63% gc time, 10.16% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = 20000\n",
"@show Threads.nthreads()\n",
"@time sims = run_simulations5(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2d3fc781-986d-4d90-9dad-264284d3c2b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Multi-thread non-dataframe version (run_simulations5): \n",
"Threads.nthreads() = 12\n",
" 664.780 ms (760131 allocations: 2.64 GiB)\n"
]
}
],
"source": [
"println(\"Multi-thread non-dataframe version (run_simulations5): \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations5($n);"
]
},
{
"cell_type": "markdown",
"id": "e21456c5-441a-4382-aa20-67d05ba3bb08",
"metadata": {},
"source": [
"## Optimized non-DataFrame version"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "ebd9b54e-da8d-46fd-a6df-b4553a80f122",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n = 20000 = 20000\n",
" 1.514930 seconds (2.50 M allocations: 128.396 MiB, 3.07% gc time, 35.76% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"using Distributions\n",
"using LinearAlgebra\n",
"using Random: rand!\n",
"using BenchmarkTools\n",
"using Plots\n",
"\n",
"const std_normal = Normal(0, 1)\n",
"\n",
"# use NamedTuple indtead of DataFrame\n",
"function undef_data_namedtuple(tt)\n",
" id = vcat(fill(0, tt), fill(1, tt))\n",
" x1 = similar(id, Float64)\n",
" x2 = similar(x1)\n",
" x1_dmean = similar(x1)\n",
" x2_dmean = similar(x2)\n",
" y = similar(x1)\n",
" (; id, x1, x2, x1_dmean, x2_dmean, y)\n",
"end\n",
"\n",
"function gen_data!(data, tmp)\n",
" tt = length(tmp) # Assume 2tt = nrow(data)\n",
" A, B = 1:tt, tt+1:2tt\n",
" \n",
" # Warning: dataframe is type-unstable\n",
" x1::Vector{Float64} = data.x1\n",
" x2::Vector{Float64} = data.x2\n",
" x1_dmean::Vector{Float64} = data.x1_dmean\n",
" x2_dmean::Vector{Float64} = data.x2_dmean\n",
" y::Vector{Float64} = data.y\n",
" \n",
" @views x1[A] .= 1 .+ rand!(std_normal, tmp)\n",
" @views x1[B] .= 1/4 .+ rand!(std_normal, tmp)\n",
" @views x2[A] .= 1 .+ x1[A] .+ rand!(std_normal, tmp)\n",
" @views x2[B] .= 1 .+ x1[B] .+ rand!(std_normal, tmp)\n",
"\n",
" # outcomes (notice different slope coefs for x2[A] and x2[B])\n",
" @views y[A] .= x1[A] .+ 1 .* x2[A] .+ rand!(std_normal, tmp)\n",
" @views y[B] .= x1[B] .+ 2 .* x2[B] .+ rand!(std_normal, tmp)\n",
" \n",
" @views x1_dmean[A] .= x1[A] .- mean(x1[A])\n",
" @views x1_dmean[B] .= x1[B] .- mean(x1[B])\n",
" @views x2_dmean[A] .= x2[A] .- mean(x2[A])\n",
" @views x2_dmean[B] .= x2[B] .- mean(x2[B])\n",
" \n",
" data\n",
"end\n",
"\n",
"function fastfit!(mod, X, y, XtX, Xty)\n",
" mul!(XtX, X', X)\n",
" mul!(Xty, X', y)\n",
" ldiv!(mod, cholesky!(Symmetric(XtX)), Xty)\n",
"end\n",
"\n",
"function coefs_lm_formula!(result, data, X, mod_level, mod_dmean, XtX, Xty)\n",
" # Warning: dataframe is type-unstable.\n",
" id::Vector{Int} = data.id\n",
" x1::Vector{Float64} = data.x1\n",
" x2::Vector{Float64} = data.x2\n",
" x1_dmean::Vector{Float64} = data.x1_dmean\n",
" x2_dmean::Vector{Float64} = data.x2_dmean\n",
" y::Vector{Float64} = data.y\n",
" \n",
" X[:, 1] .= 1\n",
" X[:, 2] .= id\n",
" X[:, 3] .= x1\n",
" X[:, 4] .= x2\n",
" X[:, 5] .= x1 .* x2\n",
" fastfit!(mod_level, X, y, XtX, Xty)\n",
"\n",
" X[:, 5] .= x1_dmean .* x2_dmean\n",
" fastfit!(mod_dmean, X, y, XtX, Xty)\n",
" \n",
" result .= (mod_level[end], mod_dmean[end])\n",
"end\n",
"\n",
"function run_simulations_optimized_nondataframe(nsim, tt=500)\n",
" sims = Matrix{Float64}(undef, nsim, 2)\n",
" data = undef_data_namedtuple(tt) # use NamedTuple indtead of DataFrame\n",
" tmp = Vector{Float64}(undef, tt)\n",
" X = Matrix{Float64}(undef, 2tt, 5)\n",
" mod_level = Vector{Float64}(undef, 5)\n",
" mod_dmean = similar(mod_level)\n",
" XtX = Matrix{Float64}(undef, 5, 5)\n",
" Xty = Vector{Float64}(undef, 5)\n",
" for i in 1:nsim\n",
" gen_data!(data, tmp)\n",
" @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n",
" end\n",
" sims\n",
"end\n",
"\n",
"@show n = 20000\n",
"@time sims = run_simulations_optimized_nondataframe(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "769c9917-0f7e-492d-a793-f62999fc6a34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimized non-dataframe version (single thread): \n",
" 885.032 ms (160017 allocations: 5.90 MiB)\n"
]
}
],
"source": [
"println(\"Optimized non-dataframe version (single thread): \")\n",
"@btime run_simulations_optimized_nondataframe($n);"
]
},
{
"cell_type": "markdown",
"id": "2ee0a401-b16d-4da3-9551-7ebbcdfcd563",
"metadata": {},
"source": [
"## Multi-thread optimized non-DataFrame version"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ce74687b-be7c-4e86-bc38-558d184e8317",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Threads.nthreads() = 12\n",
" 0.211943 seconds (228.71 k allocations: 11.044 MiB, 6.34% compilation time)\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pkg> dev https://github.com/genkuroki/MyUtils.jl\n",
"using MyUtils: @my_threads\n",
"\n",
"function run_simulations_multithread_optimized_nondataframe(nsim, tt=500)\n",
" sims = Matrix{Float64}(undef, nsim, 2)\n",
" @my_threads begin\n",
" data = undef_data_namedtuple(tt)\n",
" tmp = Vector{Float64}(undef, tt)\n",
" X = Matrix{Float64}(undef, 2tt, 5)\n",
" mod_level = Vector{Float64}(undef, 5)\n",
" mod_dmean = similar(mod_level)\n",
" XtX = Matrix{Float64}(undef, 5, 5)\n",
" Xty = Vector{Float64}(undef, 5)\n",
" end for i in 1:nsim\n",
" gen_data!(data, tmp)\n",
" @views coefs_lm_formula!(sims[i, :], data, X, mod_level, mod_dmean, XtX, Xty)\n",
" end begin\n",
" end\n",
" sims\n",
"end\n",
"\n",
"@show Threads.nthreads()\n",
"@time sims = run_simulations_multithread_optimized_nondataframe(n)\n",
"histogram(sims, label = [\"level\" \"dmean\"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "2ee53979-a586-405e-8cac-4d04d5c86760",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Multi-thread optimized non-dataframe version: \n",
"Threads.nthreads() = 12\n",
" 163.337 ms (160247 allocations: 6.97 MiB)\n"
]
}
],
"source": [
"println(\"Multi-thread optimized non-dataframe version: \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations_multithread_optimized_nondataframe($n);"
]
},
{
"cell_type": "markdown",
"id": "f842c2ca-152b-4ed3-978a-f4bc54da3320",
"metadata": {},
"source": [
"## Summary"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "67e7afe6-c3a8-404e-b4a7-fa9f38397fd3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"VERSION = v\"1.6.1\"\n",
"n = 20000\n",
"\n",
"Original version (with DataFrame, single thread):\n",
" 1.836 s (2040002 allocations: 4.65 GiB)\n",
"\n",
"Optimized version (with DataFrame, single thread):\n",
" 918.817 ms (160062 allocations: 5.94 MiB)\n",
"\n",
"Multi-thread optimized version (with DataFrame): \n",
"Threads.nthreads() = 12\n",
" 168.350 ms (160786 allocations: 7.56 MiB)\n",
"\n",
"Multi-thread non-dataframe version (run_simulations5): \n",
"Threads.nthreads() = 12\n",
" 714.319 ms (760130 allocations: 2.64 GiB)\n",
"\n",
"Optimized non-dataframe version (single thread): \n",
" 869.904 ms (160017 allocations: 5.90 MiB)\n",
"\n",
"Multi-thread optimized non-dataframe version: \n",
"Threads.nthreads() = 12\n",
" 161.730 ms (160246 allocations: 6.97 MiB)\n",
"\n"
]
}
],
"source": [
"n = 20000\n",
"\n",
"@show VERSION\n",
"@show n\n",
"println()\n",
"\n",
"println(\"Original version (with DataFrame, single thread):\")\n",
"@btime run_simulations($n)\n",
"println()\n",
"\n",
"println(\"Optimized version (with DataFrame, single thread):\")\n",
"@btime run_simulations_optimized($n)\n",
"println()\n",
"\n",
"println(\"Multi-thread optimized version (with DataFrame): \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations_multithread_optimized($n)\n",
"println()\n",
"\n",
"println(\"Multi-thread non-dataframe version (run_simulations5): \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations5($n)\n",
"println()\n",
"\n",
"println(\"Optimized non-dataframe version (single thread): \")\n",
"@btime run_simulations_optimized_nondataframe($n)\n",
"println()\n",
"\n",
"println(\"Multi-thread optimized non-dataframe version: \")\n",
"@show Threads.nthreads()\n",
"@btime run_simulations_multithread_optimized_nondataframe($n)\n",
"println()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "a8d6c0b4-8c51-4183-827a-e78bc51ee381",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Julia Version 1.6.1\n",
"Commit 6aaedecc44 (2021-04-23 05:59 UTC)\n",
"Platform Info:\n",
" OS: Windows (x86_64-w64-mingw32)\n",
" CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz\n",
" WORD_SIZE: 64\n",
" LIBM: libopenlibm\n",
" LLVM: libLLVM-11.0.1 (ORCJIT, skylake)\n",
"Environment:\n",
" JULIA_DEPOT_PATH = D:\\.julia\n",
" JULIA_NUM_THREADS = 12\n",
" JULIA_PYTHONCALL_EXE = D:\\.julia\\conda\\3\\python.exe\n"
]
}
],
"source": [
"versioninfo()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e930f102-ec6a-4733-af04-2c3dc7d8f9e2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"jupytext": {
"formats": "ipynb,jl:hydrogen"
},
"kernelspec": {
"display_name": "Julia 1.6.1",
"language": "julia",
"name": "julia-1.6"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}