\documentclass{lug}
\title{GPU Computing}
\author{Sam Sartor}
\institute{Mines Linux Users Group}

\usepackage{etoolbox}
\usepackage{array}
\usepackage{amsmath}
\usepackage{adjustbox}
\usepackage{calc}
\usepackage{lmodern}

\makeatletter
\patchcmd{\beamer@sectionintoc}{\vskip1.5em}{\vskip0.5em}{}{}
\makeatother

\newcommand{\pmidg}[1]{\parbox{\widthof{#1}}{#1}}
\newcommand{\splitslide}[4]{
  \noindent
  \begin{minipage}{#1 \textwidth - #2 }
    #3
  \end{minipage}%
  \hspace{ \dimexpr #2 * 2 \relax }%
  \begin{minipage}{\textwidth - #1 \textwidth - #2 }
    #4
  \end{minipage}
}

\begin{document}

\section{The GPU}

\begin{frame}{What is a GPU?}
  \splitslide{0.65}{.7em}{
    A Graphics Processing Unit (GPU) is a specialized chip designed
    primarily to accelerate graphics calculations.

    \vspace{2ex}

    GPUs generally derive their performance from their ability to do large
    numbers of identical arithmetic calculations in parallel.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/tesla}}
  }
\end{frame}

\begin{frame}{GPUs for Graphics}
  \splitslide{0.75}{.7em}{
    Screens have a lot of pixels that need to be calculated very quickly.
    All of the required calculations are identical, just with different
    input numbers. And because pixels are independent, the calculations are
    trivial to parallelize.

    As a result, using the unnecessarily clever CPU would be wasteful and
    slow. A separate pixel-optimized chip can be used instead, leaving the
    CPU to do the important stuff.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/pixels}}
  }
\end{frame}

\begin{frame}{GPU Computing}
  \splitslide{0.75}{.7em}{
    Coloring pixels is not the only problem that involves a large number of
    similar, repetitive calculations. General-purpose GPUs can be used for
    countless other problems, including machine learning, computer vision,
    signal processing, statistics, linear algebra, finance, and
    cryptography.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/resimg}}
  }
\end{frame}

\begin{frame}{History}
  \splitslide{0.75}{.7em}{
    \emph{1970s} - Highly specialized, used only for buffering video and
    drawing simple 2D rasters (sprites)

    \emph{1980s} - Common bitmap operations such as filling simple 2D
    shapes

    \emph{1990s} - 3D triangle-based graphics; common interfaces (OpenGL,
    Direct3D) developed

    \emph{2000s} - General-purpose GPUs, capable of executing arbitrary
    instructions

    \emph{2010s} - Highly general, used as much for supercomputing as for
    graphics
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/ibm-pc-mda}}
  }
\end{frame}

\section{How do GPUs Work?}

\begin{frame}{Architecture}
  \includegraphics[width=\textwidth]{graphics/gpuarch}

  GPUs excel at repetition. Instead of performing the same calculation many
  times in sequence, they step through a sequence of instructions all at
  once, using many cores. Each core does the same operation at the same
  time, but with different inputs.
\end{frame}

\begin{frame}{Branching}
  \splitslide{0.65}{.7em}{
    Unlike CPUs, which jump back and forth through a program as conditions
    are met, a GPU runs every possible instruction in sequence, turning
    different cores on and off as branching occurs. In effect, GPUs are
    useful for parallel computation but not for multitasking.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/branching}}
  }
\end{frame}

\section{Computing At Home}

\begin{frame}{OpenGL Shaders}
  \splitslide{0.75}{.7em}{
    Although shaders are used for pixel stuff, they are still fundamentally
    general purpose. Use vertex attributes, uniforms, and textures as
    input. Use the framebuffer for output.

    \vspace{2ex}

    OpenGL bindings exist for every language under the sun.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/shaderfluid}}
  }
\end{frame}

\begin{frame}{OpenGL Shaders - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Shaders have been around since the early 2000s
    \item Universally supported
    \item OpenGL allows for minimal setup
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Low level
    \item Not very general
    \item All data has to be stored in textures/images
  \end{itemize}
\end{frame}

\begin{frame}{CUDA}
  \splitslide{0.75}{.7em}{
    CUDA is a computing platform and API that provides truly general GPU
    computing. C/C++/Fortran code can be compiled ahead of time or at
    runtime and sent to the GPU along with arbitrary chunks of memory.

    \vspace{2ex}

    Libraries for controlling and communicating with CUDA programs exist
    for many languages, including C/C++ (through the CUDA SDK) and Python
    (the PyCUDA library).
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/nvidia-cuda}}
  }
\end{frame}
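\begin{frame}[fragile]{CUDA - Example}
  A minimal sketch of runtime compilation with PyCUDA, modeled on PyCUDA's
  own hello-world example (assumes a working CUDA install and an Nvidia
  card):

  {\footnotesize
\begin{verbatim}
import numpy as np
import pycuda.autoinit  # creates a context on the default GPU
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# CUDA C source, compiled and sent to the GPU at runtime
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    const int i = threadIdx.x;
    dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = np.random.randn(400).astype(np.float32)
b = np.random.randn(400).astype(np.float32)
dest = np.zeros_like(a)

# one block of 400 threads; each computes one element
multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
              block=(400, 1, 1), grid=(1, 1))
\end{verbatim}
  }
\end{frame}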
\begin{frame}{CUDA - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Get to use real C/C++
    \item Pointers, recursion, etc.
    \item Copy arbitrary data between CPU and GPU
    \item Fast
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Only available on Nvidia cards
    \item Low level
    \item Annoying to set up
  \end{itemize}
\end{frame}

\begin{frame}{OpenCL}
  \splitslide{0.75}{.7em}{
    OpenCL is a cross-platform alternative to CUDA. It is similar in
    structure to OpenGL, but intended for general-purpose computation (not
    just 3D graphics).

    \vspace{2ex}

    Bindings exist for all languages. I even found a Brainfuck API.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/opencl-logo}}
  }
\end{frame}

\begin{frame}{OpenCL - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Cross-platform
    \item Nice API
    \item Will use the CPU instead of the GPU if needed (works anywhere)
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Must use the C-like OpenCL kernel language
    \item No recursion, function pointers, etc.
    \item Slightly slower than CUDA
  \end{itemize}
\end{frame}

\begin{frame}{ArrayFire}
  \splitslide{0.75}{.7em}{
    ArrayFire is an easy-to-use library of high-level functions with
    built-in implementations for CUDA, OpenCL, and the CPU. It is useful
    for linear algebra, statistics, trigonometry, signal processing, image
    processing, and more.

    \vspace{2ex}

    ArrayFire has bindings for C++, Python, Go, Rust, Ruby, Lisp, Java,
    Fortran, D, R, C\#, JavaScript, and Lua.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/arrayfire-logo}}
  }
\end{frame}

\begin{frame}{ArrayFire - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Trivial to use
    \item Cross-platform
    \item Just pass arrays to functions
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Limited library of functions
    \item No way of defining your own
  \end{itemize}
\end{frame}

\begin{frame}{Torch}
  \splitslide{0.75}{.7em}{
    Torch is a popular Lua library for machine learning. It has CPU, CUDA,
    and OpenCL backends available.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/torch-logo}}
  }
\end{frame}

\begin{frame}{Torch - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Large community
    \item High-level API
    \item Fast
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Lua
  \end{itemize}
\end{frame}

\begin{frame}{TensorFlow}
  \splitslide{0.75}{.7em}{
    TensorFlow is Google's library for moving big lists of numbers around,
    generally with machine learning in mind. As a result, Torch and
    TensorFlow are currently at war. It has a CPU implementation and a
    CUDA-based GPU implementation.

    \vspace{2ex}

    TensorFlow is primarily for Python, with C++ behind the scenes.
  }{
    \pmidg{\includegraphics[width=\textwidth]{graphics/tensorflow-logo}}
  }
\end{frame}
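\begin{frame}[fragile]{TensorFlow - Example}
  A minimal sketch of the graph-based Python API: first describe the
  computation, then run it in a session. TensorFlow places the work on the
  GPU automatically when one is available.

  {\footnotesize
\begin{verbatim}
import numpy as np
import tensorflow as tf

# Build a graph: c = a x b for two 1024x1024 matrices
a = tf.constant(np.random.randn(1024, 1024).astype(np.float32))
b = tf.constant(np.random.randn(1024, 1024).astype(np.float32))
c = tf.matmul(a, b)

# Nothing has executed yet; the session runs the graph
with tf.Session() as sess:
    result = sess.run(c)
\end{verbatim}
  }
\end{frame}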
\begin{frame}{TensorFlow - Pros \& Cons}
  \textbf{Pros}
  \begin{itemize}
    \item Python
    \item Good visualization tools
    \item Cool abstraction
    \item Best library for recurrent neural networks
  \end{itemize}

  \textbf{Cons}
  \begin{itemize}
    \item Slightly slower than Torch (for now)
    \item Tricky to set up (CUDA)
    \item Needs a high-end Nvidia card to use the GPU
  \end{itemize}
\end{frame}

\end{document}