{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "gemm_pji! (generic function with 1 method)" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Simple three-nested-loop gemm implementations in Julia, all loop orderings\n", "function gemm_ijp!(C, A, B)\n", " for i in 1:size(C, 1),\n", " j in 1:size(C, 2),\n", " p in 1:size(A, 2)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end\n", "function gemm_ipj!(C, A, B)\n", " for i in 1:size(C, 1),\n", " p in 1:size(A, 2),\n", " j in 1:size(C, 2)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end\n", "function gemm_jip!(C, A, B)\n", " for j in 1:size(C, 2),\n", " i in 1:size(C, 1),\n", " p in 1:size(A, 2)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end\n", "function gemm_jpi!(C, A, B)\n", " for j in 1:size(C, 2),\n", " p in 1:size(A, 2),\n", " i in 1:size(C, 1)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end\n", "function gemm_pij!(C, A, B)\n", " for p in 1:size(A, 2),\n", " i in 1:size(C, 1),\n", " j in 1:size(C, 2)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end\n", "function gemm_pji!(C, A, B)\n", " for p in 1:size(A, 2),\n", " j in 1:size(C, 2),\n", " i in 1:size(C, 1)\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", "end" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "gemm_pji! (generic function with 1 method)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# In Julia writing code that writes code is easy,\n", "# so here we write a little macro to generate\n", "# the above methods for us! :)\n", "macro def_gemm_xyz!(xyz)\n", " x, y, z = (Symbol(q) for q in String(xyz))\n", "\n", " function indexrange(q)\n", " if q == :i return :(1:size(C, 1)) end\n", " if q == :j return :(1:size(C, 2)) end\n", " if q == :p return :(1:size(A, 2)) end\n", " end\n", "\n", " return quote\n", " function $(esc(Symbol(:gemm_, xyz, :!)))(C, A, B)\n", " for $(x) in $(indexrange(x)),\n", " $(y) in $(indexrange(y)),\n", " $(z) in $(indexrange(z))\n", " @inbounds C[i, j] += A[i, p] * B[p, j]\n", " end\n", " return C\n", " end\n", " end\n", "end\n", "@def_gemm_xyz! ijp\n", "@def_gemm_xyz! ipj\n", "@def_gemm_xyz! jip\n", "@def_gemm_xyz! jpi\n", "@def_gemm_xyz! pij\n", "@def_gemm_xyz! pji" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# The same definitions in C, as a string which we compile below\n", "#\n", "# We could programmatically generate these strings as well,\n", "# and likewise with the wrappers below and other repeated code,\n", "# but perhaps simple and explicit is best for now.\n", "gemm_xyz_ccode = \"\"\"\n", "#define gamma(i,j) C[j*m + i]\n", "#define alpha(i,p) A[p*m + i]\n", "#define beta(p,j) B[j*k + p]\n", "void gemm_ijp_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int i = 0; i < m; ++i )\n", " for ( int j = 0; j < n; ++j )\n", " for ( int p = 0; p < k; ++p )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "void gemm_ipj_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int i = 0; i < m; ++i )\n", " for ( int p = 0; p < k; ++p )\n", " for ( int j = 0; j < n; ++j )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "void gemm_jip_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int j = 0; j < n; ++j )\n", " for ( int i = 0; i < m; ++i )\n", " for ( int p = 0; p < k; ++p )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "void gemm_jpi_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int j = 0; j < n; ++j )\n", " for ( int p = 0; p < k; ++p )\n", " for ( int i = 0; i < m; ++i )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "void gemm_pij_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int p = 0; p < k; ++p )\n", " for ( int i = 0; i < m; ++i )\n", " for ( int j = 0; j < n; ++j )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "void gemm_pji_c(double* C, double* A, double* B, int m, int k, int n) {\n", " for ( int p = 0; p < k; ++p )\n", " for ( int j = 0; j < n; ++j )\n", " for ( int i = 0; i < m; ++i )\n", " gamma(i, j) += alpha(i, p) * beta(p, j);\n", "}\n", "\"\"\";" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Compile!\n", "using Libdl # standard library package\n", "const CGemmLib = tempname()\n", "open(`gcc -fPIC -O3 -march=native -xc -shared -o $(CGemmLib * \".\" * Libdl.dlext) -`, \"w\") do f\n", " print(f, gemm_xyz_ccode) \n", "end" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "gemm_pji_c! (generic function with 1 method)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Wrap the C implementations as Julia functions!\n", "gemm_ijp_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_ijp_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)\n", "gemm_ipj_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_ipj_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)\n", "gemm_jip_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_jip_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)\n", "gemm_jpi_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_jpi_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)\n", "gemm_pij_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_pij_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)\n", "gemm_pji_c!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64}) =\n", " (ccall((\"gemm_pji_c\", CGemmLib), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Cint, Cint), C, A, B, size(C, 1), size(A, 2), size(C, 2)); return C)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[32m\u001b[1mTest Passed\u001b[22m\u001b[39m" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Briefly check correctness\n", "using Test # standard library package\n", "let\n", " m, n, k = 48*3, 48*2, 48\n", " C = rand(m, n)\n", " A = rand(m, k)\n", " B = rand(k, n)\n", " Cref = A * B\n", " @test gemm_ijp!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_ipj!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_jip!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_jpi!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_pij!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_pji!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_ijp_c!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_ipj_c!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_jip_c!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_jpi_c!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_pij_c!(fill!(C, 0), A, B) ≈ Cref\n", " @test gemm_pji_c!(fill!(C, 0), A, B) ≈ Cref\n", "end" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m\u001b[1m Resolving\u001b[22m\u001b[39m package versions...\n", "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Project.toml`\n", "\u001b[90m [no changes]\u001b[39m\n", "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Manifest.toml`\n", "\u001b[90m [no changes]\u001b[39m\n" ] } ], "source": [ "# Benchmarking time! :D\n", "#\n", "# We will use BenchmarkTools.jl, which makes statisically rigorous benchmarking easy.\n", "# It's not a standard library package, so we need to use the package manager (Pkg,\n", "# which is a standard library package, and so need not be explicitly installed)\n", "# to install (\"add\") BenchmarkTools.jl prior to `using BenchmarkTools`\n", "using Pkg; Pkg.add(\"BenchmarkTools\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Benchmarking with 48-by-48 matrices... 18.042159 seconds (6.14 M allocations: 300.416 MiB, 25.94% gc time)\n", "Benchmarking with 144-by-144 matrices... 18.939694 seconds (4.67 M allocations: 231.948 MiB, 25.05% gc time)\n", "Benchmarking with 240-by-240 matrices... 19.253167 seconds (4.64 M allocations: 230.529 MiB, 23.41% gc time)\n", "Benchmarking with 336-by-336 matrices... 21.535816 seconds (4.64 M allocations: 230.290 MiB, 22.29% gc time)\n", "Benchmarking with 432-by-432 matrices... 22.547858 seconds (4.64 M allocations: 230.258 MiB, 20.34% gc time)\n", "Benchmarking with 528-by-528 matrices... 24.346598 seconds (4.64 M allocations: 230.339 MiB, 19.22% gc time)\n", "Benchmarking with 624-by-624 matrices... 31.795035 seconds (4.64 M allocations: 230.174 MiB, 14.98% gc time)\n", "Benchmarking with 720-by-720 matrices... 36.274252 seconds (4.64 M allocations: 230.204 MiB, 14.30% gc time)\n", "Benchmarking with 816-by-816 matrices... 74.278325 seconds (4.64 M allocations: 230.156 MiB, 7.03% gc time)\n", "Benchmarking with 912-by-912 matrices...132.907453 seconds (4.64 M allocations: 230.181 MiB, 3.98% gc time)\n" ] } ], "source": [ "# Now that we have BenchmarkTools.jl, we can begin with the benchmarks proper\n", "using BenchmarkTools\n", "\n", "# Constrain time that we allow BenchmarkTools.jl to collect samples for each benchmark\n", "BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5\n", "\n", "mnks = 48*(1:2:20)\n", "foo = zeros(length(mnks))\n", "timings_ijp = copy(foo)\n", "timings_ipj = copy(foo)\n", "timings_jip = copy(foo)\n", "timings_jpi = copy(foo)\n", "timings_pij = copy(foo)\n", "timings_pji = copy(foo)\n", "timings_ijp_c = copy(foo)\n", "timings_ipj_c = copy(foo)\n", "timings_jip_c = copy(foo)\n", "timings_jpi_c = copy(foo)\n", "timings_pij_c = copy(foo)\n", "timings_pji_c = copy(foo)\n", "for (mnkind, mnk) in enumerate(mnks)\n", " A = rand(mnk, mnk)\n", " B = rand(mnk, mnk)\n", " C = rand(mnk, mnk)\n", " print(\"Benchmarking with $mnk-by-$mnk matrices...\"); @time begin\n", " timings_ijp[mnkind] = @belapsed gemm_ijp!($C, $A, $B)\n", " timings_ipj[mnkind] = @belapsed gemm_ipj!($C, $A, $B)\n", " timings_jip[mnkind] = @belapsed gemm_jip!($C, $A, $B)\n", " timings_jpi[mnkind] = @belapsed gemm_jpi!($C, $A, $B)\n", " timings_pij[mnkind] = @belapsed gemm_pij!($C, $A, $B)\n", " timings_pji[mnkind] = @belapsed gemm_pji!($C, $A, $B)\n", " timings_ijp_c[mnkind] = @belapsed gemm_ijp_c!($C, $A, $B)\n", " timings_ipj_c[mnkind] = @belapsed gemm_ipj_c!($C, $A, $B)\n", " timings_jip_c[mnkind] = @belapsed gemm_jip_c!($C, $A, $B)\n", " timings_jpi_c[mnkind] = @belapsed gemm_jpi_c!($C, $A, $B)\n", " timings_pij_c[mnkind] = @belapsed gemm_pij_c!($C, $A, $B)\n", " timings_pji_c[mnkind] = @belapsed gemm_pji_c!($C, $A, $B)\n", " end\n", "end;" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m\u001b[1m Resolving\u001b[22m\u001b[39m package versions...\n", "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Project.toml`\n", "\u001b[90m [no changes]\u001b[39m\n", "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Manifest.toml`\n", "\u001b[90m [no changes]\u001b[39m\n" ] } ], "source": [ "# Visualize benchmarks\n", "#\n", "# We will use Plots.jl to visualize the benchmark results.\n", "# Like BenchmarkTools.jl, Plots.jl is not a standard library package\n", "# and hence we need install it prior to use:\n", "Pkg.add(\"Plots\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "200\n", "\n", "\n", "400\n", "\n", "\n", "600\n", "\n", "\n", "800\n", "\n", "\n", "2.5\n", "\n", "\n", "5.0\n", "\n", "\n", "7.5\n", "\n", "\n", "10.0\n", "\n", "\n", "12.5\n", "\n", "\n", "matrix dimensions (m = n = k)\n", "\n", "\n", "GFLOPS from minimum sample time\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "ijp\n", "\n", "\n", "\n", "ipj\n", "\n", "\n", "\n", "jip\n", "\n", "\n", "\n", "jpi\n", "\n", "\n", "\n", "pij\n", "\n", "\n", "\n", "pji\n", "\n", "\n", "\n", "ijp_c\n", "\n", "\n", "\n", "ipj_c\n", "\n", "\n", "\n", "jip_c\n", "\n", "\n", "\n", "jpi_c\n", "\n", "\n", "\n", "pij_c\n", "\n", "\n", "\n", "pji_c\n", "\n", "\n" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now that we have Plots.jl, we can begin with the visualizations proper\n", "using Plots\n", "gr() # this loads the GR framework as the drawing backend for Plots.jl\n", "\n", "# Let's define some nice colors from the Tableau palette\n", "tableaurgb = ((255, 158, 74), (237, 102, 93),\n", " (173, 139, 201), (114, 158, 206), (103, 191, 92), (237, 151, 202),\n", " (205, 204, 93), (168, 120, 110), (162, 162, 162), (109, 204, 218))\n", "tableaucolors = (rgb -> string(\"0x\", string.(rgb; base = 16, pad = 2)...)).(tableaurgb)\n", "\n", "timetogflops(mnk, time) = 2 * mnk^3 / time / 10^9\n", "gflops_ijp = timetogflops.(mnks, timings_ijp)\n", "gflops_ipj = timetogflops.(mnks, timings_ipj)\n", "gflops_jip = timetogflops.(mnks, timings_jip)\n", "gflops_jpi = timetogflops.(mnks, timings_jpi)\n", "gflops_pij = timetogflops.(mnks, timings_pij)\n", "gflops_pji = timetogflops.(mnks, timings_pji)\n", "gflops_ijp_c = timetogflops.(mnks, timings_ijp_c)\n", "gflops_ipj_c = timetogflops.(mnks, timings_ipj_c)\n", "gflops_jip_c = timetogflops.(mnks, timings_jip_c)\n", "gflops_jpi_c = timetogflops.(mnks, timings_jpi_c)\n", "gflops_pij_c = timetogflops.(mnks, timings_pij_c)\n", "gflops_pji_c = timetogflops.(mnks, timings_pji_c)\n", "\n", "ylims = extrema(vcat(\n", " gflops_ijp, gflops_ipj,\n", " gflops_jip, gflops_jpi,\n", " gflops_pij, gflops_pji,\n", " gflops_ijp_c, gflops_ipj_c,\n", " gflops_jip_c, gflops_jpi_c,\n", " gflops_pij_c, gflops_pji_c))\n", "foo = plot(mnks, gflops_ijp, label = \"ijp\", color = tableaucolors[2],\n", " xlabel = \"matrix dimensions (m = n = k)\",\n", " ylabel = \"GFLOPS from minimum sample time\",\n", " xlim = (first(mnks), last(mnks)),\n", " ylim = (first(ylims), last(ylims)),)\n", "plot!(mnks, gflops_ipj, label = \"ipj\", color = tableaucolors[3])\n", "plot!(mnks, gflops_jip, label = \"jip\", color = tableaucolors[4])\n", "plot!(mnks, gflops_jpi, label = \"jpi\", color = tableaucolors[5])\n", "plot!(mnks, gflops_pij, label = \"pij\", color = tableaucolors[6])\n", "plot!(mnks, gflops_pji, label = \"pji\", color = tableaucolors[7])\n", "plot!(mnks, gflops_ijp_c, label = \"ijp_c\", line = (2, :dot), color = tableaucolors[2])\n", "plot!(mnks, gflops_ipj_c, label = \"ipj_c\", line = (2, :dot), color = tableaucolors[3])\n", "plot!(mnks, gflops_jip_c, label = \"jip_c\", line = (2, :dot), color = tableaucolors[4])\n", "plot!(mnks, gflops_jpi_c, label = \"jpi_c\", line = (2, :dot), color = tableaucolors[5])\n", "plot!(mnks, gflops_pij_c, label = \"pij_c\", line = (2, :dot), color = tableaucolors[6])\n", "plot!(mnks, gflops_pji_c, label = \"pji_c\", line = (2, :dot), color = tableaucolors[7])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.0.0", "language": "julia", "name": "julia-1.0" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.0.0" } }, "nbformat": 4, "nbformat_minor": 2 }