{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Multi-GPU Computation Implementation from Scratch" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:12.539924Z", "start_time": "2019-07-05T23:40:07.289355Z" }, "attributes": { "classes": [], "id": "", "n": "1" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fri Jul 5 23:40:11 2019 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla V100-SXM2... Off | 00000000:00:1B.0 Off | 0 |\n", "| N/A 47C P0 54W / 300W | 0MiB / 16130MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", "| 1 Tesla V100-SXM2... Off | 00000000:00:1C.0 Off | 0 |\n", "| N/A 45C P0 55W / 300W | 2005MiB / 16130MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", "| 2 Tesla V100-SXM2... Off | 00000000:00:1D.0 Off | 0 |\n", "| N/A 46C P0 58W / 300W | 0MiB / 16130MiB | 4% Default |\n", "+-------------------------------+----------------------+----------------------+\n", "| 3 Tesla V100-SXM2... Off | 00000000:00:1E.0 Off | 0 |\n", "| N/A 43C P0 40W / 300W | 11MiB / 16130MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: GPU Memory |\n", "| GPU PID Type Process name Usage |\n", "|=============================================================================|\n", "| 1 49887 C /home/ubuntu/miniconda3/bin/python 1001MiB |\n", "| 1 59096 C /home/ubuntu/miniconda3/bin/python 993MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } ], "source": [ "%matplotlib inline\n", "import d2l\n", "from mxnet import autograd, np, npx, gluon\n", "npx.set_np()\n", "\n", "!nvidia-smi" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Initializing LeNet." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:12.554122Z", "start_time": "2019-07-05T23:40:12.542050Z" }, "attributes": { "classes": [], "id": "", "n": "10" } }, "outputs": [], "source": [ "# Initialize model parameters\n", "scale = 0.01\n", "W1 = np.random.normal(scale=scale, size=(20, 1, 3, 3))\n", "b1 = np.zeros(20)\n", "W2 = np.random.normal(scale=scale, size=(50, 20, 5, 5))\n", "b2 = np.zeros(50)\n", "W3 = np.random.normal(scale=scale, size=(800, 128))\n", "b3 = np.zeros(128)\n", "W4 = np.random.normal(scale=scale, size=(128, 10))\n", "b4 = np.zeros(10)\n", "params = [W1, b1, W2, b2, W3, b3, W4, b4]\n", "\n", "# Define the model\n", "def lenet(X, params):\n", " h1_conv = npx.convolution(data=X, weight=params[0], bias=params[1],\n", " kernel=(3, 3), num_filter=20)\n", " h1_activation = npx.relu(h1_conv)\n", " h1 = npx.pooling(data=h1_activation, pool_type='avg', kernel=(2, 2),\n", " stride=(2, 2))\n", " h2_conv = npx.convolution(data=h1, weight=params[2], bias=params[3],\n", " kernel=(5, 5), num_filter=50)\n", " h2_activation = npx.relu(h2_conv)\n", " h2 = npx.pooling(data=h2_activation, pool_type='avg', kernel=(2, 2),\n", " stride=(2, 2))\n", " h2 = h2.reshape((h2.shape[0], -1))\n", " h3_linear = np.dot(h2, params[4]) + params[5]\n", " h3 = npx.relu(h3_linear)\n", " y_hat = np.dot(h3, params[6]) + params[7]\n", " return y_hat\n", "\n", "# Cross-entropy loss function\n", "loss = gluon.loss.SoftmaxCrossEntropyLoss()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Copy model parameters to a specific GPU and initializes gradients" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:12.559083Z", "start_time": "2019-07-05T23:40:12.555984Z" }, "attributes": { "classes": [], "id": "", "n": "12" } }, "outputs": [], "source": [ "def get_params(params, ctx):\n", " new_params = [p.copyto(ctx) for p in params]\n", " for p in new_params:\n", " p.attach_grad()\n", " return new_params" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Test" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:15.975756Z", "start_time": "2019-07-05T23:40:12.561391Z" }, "slideshow": { "slide_type": "-" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b1 weight: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] @gpu(0)\n", "b1 grad: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 
@gpu(0)\n" ] } ], "source": [ "new_params = get_params(params, d2l.try_gpu(0))\n", "print('b1 weight:', new_params[1])\n", "print('b1 grad:', new_params[1].grad)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Sum the data on all GPUs and then broadcast it" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:15.980945Z", "start_time": "2019-07-05T23:40:15.977226Z" }, "attributes": { "classes": [], "id": "", "n": "14" }, "scrolled": true }, "outputs": [], "source": [ "def allreduce(data):\n", " for i in range(1, len(data)):\n", " data[0][:] += data[i].copyto(data[0].context)\n", " for i in range(1, len(data)):\n", " data[0].copyto(data[i])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Test" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:19.358921Z", "start_time": "2019-07-05T23:40:15.982270Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "before allreduce:\n", " [[1. 1.]] @gpu(0) \n", " [[2. 2.]] @gpu(1)\n", "after allreduce:\n", " [[3. 3.]] @gpu(0) \n", " [[3. 3.]] @gpu(1)\n" ] } ], "source": [ "data = [np.ones((1, 2), ctx=d2l.try_gpu(i)) * (i + 1) for i in range(2)]\n", "print('before allreduce:\\n', data[0], '\\n', data[1])\n", "allreduce(data)\n", "print('after allreduce:\\n', data[0], '\\n', data[1])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Split a data batch into multiple GPUs" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:19.367325Z", "start_time": "2019-07-05T23:40:19.360370Z" }, "attributes": { "classes": [], "id": "", "n": "8" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input =\n", " [[ 0. 1. 2. 3. 4. 5.]\n", " [ 6. 7. 8. 9. 10. 11.]\n", " [12. 13. 14. 15. 16. 17.]\n", " [18. 19. 20. 21. 22. 23.]]\n", "output =\n", " [[ 0. 1. 2. 3. 4. 5.]\n", " [ 6. 7. 8. 9. 10. 11.]] @gpu(0) \n", " [[12. 13. 14. 15. 16. 17.]\n", " [18. 19. 20. 21. 22. 
23.]] @gpu(1)\n" ] } ], "source": [ "data = np.arange(24).reshape((4, 6))\n", "ctx = d2l.try_all_gpus()\n", "splitted = gluon.utils.split_and_load(data, ctx)\n", "print('input =\\n', data)\n", "print('output =\\n', splitted[0], '\\n', splitted[1])" ] },
{ "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "We need to split both the features and labels" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:19.372751Z", "start_time": "2019-07-05T23:40:19.369328Z" }, "attributes": { "classes": [], "id": "", "n": "9" } }, "outputs": [], "source": [ "def split_batch(X, y, ctx_list):\n", "    \"\"\"Split X and y across the devices specified by ctx_list\"\"\"\n", "    assert X.shape[0] == y.shape[0]\n", "    return (gluon.utils.split_and_load(X, ctx_list),\n", "            gluon.utils.split_and_load(y, ctx_list))" ] },
{ "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Multi-GPU training on a single mini-batch" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:19.379649Z", "start_time": "2019-07-05T23:40:19.374330Z" }, "attributes": { "classes": [], "id": "", "n": "10" } }, "outputs": [], "source": [ "def train_batch(X, y, gpu_params, ctx_list, lr):\n", "    gpu_Xs, gpu_ys = split_batch(X, y, ctx_list)\n", "    with autograd.record():  # Loss is calculated separately on each GPU\n", "        ls = [loss(lenet(gpu_X, gpu_W), gpu_y)\n", "              for gpu_X, gpu_y, gpu_W in zip(gpu_Xs, gpu_ys, gpu_params)]\n", "    for l in ls:  # Backpropagation is performed separately on each GPU\n", "        l.backward()\n", "    # Sum all the gradients from each GPU and then broadcast them\n", "    for i in range(len(gpu_params[0])):\n", "        allreduce([gpu_params[c][i].grad for c in range(len(ctx_list))])\n", "    # The model parameters are updated separately on each GPU\n", "    for param in gpu_params:\n", "        # Use the full batch size, since the gradients were summed over all GPUs\n", "        d2l.sgd(param, lr, X.shape[0])" ] },
{ "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Training function" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:19.387640Z", "start_time": "2019-07-05T23:40:19.380970Z" }, "attributes": { "classes": [], "id": "", "n": "61" } }, "outputs": [], "source": [ "def train(num_gpus, batch_size, lr):\n", "    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)\n", "    ctx_list = [d2l.try_gpu(i) for i in range(num_gpus)]\n", "    # Copy model parameters to num_gpus GPUs\n", "    gpu_params = [get_params(params, c) for c in ctx_list]\n", "    num_epochs = 10\n", "    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])\n", "    timer = d2l.Timer()\n", "    for epoch in range(num_epochs):\n", "        timer.start()\n", "        for X, y in train_iter:\n", "            train_batch(X, y, gpu_params, ctx_list, lr)\n", "            npx.waitall()\n", "        timer.stop()\n", "        animator.add(epoch + 1, (d2l.evaluate_accuracy_gpu(\n", "            lambda x: lenet(x, gpu_params[0]), test_iter, ctx_list[0]),))\n", "    print('test acc: %.2f, %.1f sec/epoch on %s' % (\n", "        animator.Y[0][-1], timer.avg(), ctx_list))\n" ] },
{ "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Training with a single GPU" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:40:43.841379Z", "start_time": "2019-07-05T23:40:19.388943Z" }, "attributes": { "classes": [], "id": 
"", "n": "62" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test acc: 0.80, 1.8 sec/epoch on [gpu(0)]\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "train(num_gpus=1, batch_size=256, lr=0.2)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Training with two GPUs" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2019-07-05T23:41:14.571867Z", "start_time": "2019-07-05T23:40:43.843093Z" }, "attributes": { "classes": [], "id": "", "n": "13" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "test acc: 0.84, 2.5 sec/epoch on [gpu(0), gpu(1)]\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "train(num_gpus=2, batch_size=256, lr=0.2)" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }