{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Multi-GPU Computation Implementation from Scratch"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:12.539924Z",
"start_time": "2019-07-05T23:40:07.289355Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "1"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fri Jul 5 23:40:11 2019 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla V100-SXM2... Off | 00000000:00:1B.0 Off | 0 |\n",
"| N/A 47C P0 54W / 300W | 0MiB / 16130MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 1 Tesla V100-SXM2... Off | 00000000:00:1C.0 Off | 0 |\n",
"| N/A 45C P0 55W / 300W | 2005MiB / 16130MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 2 Tesla V100-SXM2... Off | 00000000:00:1D.0 Off | 0 |\n",
"| N/A 46C P0 58W / 300W | 0MiB / 16130MiB | 4% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 3 Tesla V100-SXM2... Off | 00000000:00:1E.0 Off | 0 |\n",
"| N/A 43C P0 40W / 300W | 11MiB / 16130MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: GPU Memory |\n",
"| GPU PID Type Process name Usage |\n",
"|=============================================================================|\n",
"| 1 49887 C /home/ubuntu/miniconda3/bin/python 1001MiB |\n",
"| 1 59096 C /home/ubuntu/miniconda3/bin/python 993MiB |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"%matplotlib inline\n",
"import d2l\n",
"from mxnet import autograd, np, npx, gluon\n",
"npx.set_np()\n",
"\n",
"!nvidia-smi"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Initializing LeNet."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:12.554122Z",
"start_time": "2019-07-05T23:40:12.542050Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "10"
}
},
"outputs": [],
"source": [
"# Initialize model parameters: small Gaussian weights, zero biases\n",
"scale = 0.01\n",
"# Convolution 1: 20 filters of size 3x3 over 1 input channel\n",
"W1, b1 = np.random.normal(scale=scale, size=(20, 1, 3, 3)), np.zeros(20)\n",
"# Convolution 2: 50 filters of size 5x5 over 20 input channels\n",
"W2, b2 = np.random.normal(scale=scale, size=(50, 20, 5, 5)), np.zeros(50)\n",
"# Dense layer 1: 800 flattened features -> 128 hidden units\n",
"W3, b3 = np.random.normal(scale=scale, size=(800, 128)), np.zeros(128)\n",
"# Dense layer 2: 128 hidden units -> 10 outputs\n",
"W4, b4 = np.random.normal(scale=scale, size=(128, 10)), np.zeros(10)\n",
"params = [W1, b1, W2, b2, W3, b3, W4, b4]\n",
"\n",
"# Define the model\n",
"def lenet(X, params):\n",
"    \"\"\"Forward pass of a LeNet-style network built from raw npx operators.\n",
"\n",
"    params is read positionally: [0]/[1] and [2]/[3] are the weight/bias of\n",
"    the two convolutions, [4]/[5] and [6]/[7] the weight/bias of the two\n",
"    dense layers. Returns the output of the last dense layer; no softmax\n",
"    is applied here.\n",
"    \"\"\"\n",
"    # Block 1: 3x3 convolution (20 filters) -> ReLU -> 2x2 average pooling\n",
"    out = npx.convolution(data=X, weight=params[0], bias=params[1],\n",
"                          kernel=(3, 3), num_filter=20)\n",
"    out = npx.pooling(data=npx.relu(out), pool_type='avg', kernel=(2, 2),\n",
"                      stride=(2, 2))\n",
"    # Block 2: 5x5 convolution (50 filters) -> ReLU -> 2x2 average pooling\n",
"    out = npx.convolution(data=out, weight=params[2], bias=params[3],\n",
"                          kernel=(5, 5), num_filter=50)\n",
"    out = npx.pooling(data=npx.relu(out), pool_type='avg', kernel=(2, 2),\n",
"                      stride=(2, 2))\n",
"    # Flatten everything but the leading (batch) axis for the dense layers\n",
"    flat = out.reshape((out.shape[0], -1))\n",
"    hidden = npx.relu(np.dot(flat, params[4]) + params[5])\n",
"    return np.dot(hidden, params[6]) + params[7]\n",
"\n",
"# Cross-entropy loss function\n",
"loss = gluon.loss.SoftmaxCrossEntropyLoss()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Copy model parameters to a specific GPU and initialize gradients"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:12.559083Z",
"start_time": "2019-07-05T23:40:12.555984Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "12"
}
},
"outputs": [],
"source": [
"def get_params(params, ctx):\n",
"    \"\"\"Copy every parameter to ctx and attach a gradient buffer to each copy.\n",
"\n",
"    Returns the list of copies, in the same order as params.\n",
"    \"\"\"\n",
"    new_params = []\n",
"    for param in params:\n",
"        replica = param.copyto(ctx)\n",
"        replica.attach_grad()\n",
"        new_params.append(replica)\n",
"    return new_params"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Test"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:15.975756Z",
"start_time": "2019-07-05T23:40:12.561391Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"b1 weight: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] @gpu(0)\n",
"b1 grad: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] @gpu(0)\n"
]
}
],
"source": [
"# Replicate the parameters on the device returned by d2l.try_gpu(0), then\n",
"# inspect the first bias vector and its freshly attached gradient buffer\n",
"new_params = get_params(params, d2l.try_gpu(0))\n",
"b1_copy = new_params[1]\n",
"print('b1 weight:', b1_copy)\n",
"print('b1 grad:', b1_copy.grad)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Sum the data on all GPUs and then broadcast it"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:15.980945Z",
"start_time": "2019-07-05T23:40:15.977226Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "14"
},
"scrolled": true
},
"outputs": [],
"source": [
"def allreduce(data):\n",
"    \"\"\"Sum the arrays in data into data[0], then copy the sum to the others.\n",
"\n",
"    Every entry of data is overwritten in place; nothing is returned.\n",
"    \"\"\"\n",
"    # Reduce: bring each array to the first array's device and accumulate\n",
"    for other in data[1:]:\n",
"        data[0][:] += other.copyto(data[0].context)\n",
"    # Broadcast: write the accumulated sum back into each remaining array\n",
"    for other in data[1:]:\n",
"        data[0].copyto(other)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:19.358921Z",
"start_time": "2019-07-05T23:40:15.982270Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before allreduce:\n",
" [[1. 1.]] @gpu(0) \n",
" [[2. 2.]] @gpu(1)\n",
"after allreduce:\n",
" [[3. 3.]] @gpu(0) \n",
" [[3. 3.]] @gpu(1)\n"
]
}
],
"source": [
"# One (1, 2) array per device: all ones on device 0, all twos on device 1\n",
"data = []\n",
"for i in range(2):\n",
"    data.append(np.ones((1, 2), ctx=d2l.try_gpu(i)) * (i + 1))\n",
"print('before allreduce:\\n', data[0], '\\n', data[1])\n",
"allreduce(data)\n",
"# Both entries should now hold the element-wise sum\n",
"print('after allreduce:\\n', data[0], '\\n', data[1])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Split a data batch across multiple GPUs"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:19.367325Z",
"start_time": "2019-07-05T23:40:19.360370Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "8"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input =\n",
" [[ 0. 1. 2. 3. 4. 5.]\n",
" [ 6. 7. 8. 9. 10. 11.]\n",
" [12. 13. 14. 15. 16. 17.]\n",
" [18. 19. 20. 21. 22. 23.]]\n",
"output =\n",
" [[ 0. 1. 2. 3. 4. 5.]\n",
" [ 6. 7. 8. 9. 10. 11.]] @gpu(0) \n",
" [[12. 13. 14. 15. 16. 17.]\n",
" [18. 19. 20. 21. 22. 23.]] @gpu(1)\n"
]
}
],
"source": [
"# Shard a 4x6 matrix along its first axis, one slice per available device\n",
"ctx = d2l.try_all_gpus()\n",
"data = np.arange(24).reshape((4, 6))\n",
"print('input =\\n', data)\n",
"splitted = gluon.utils.split_and_load(data, ctx)\n",
"print('output =\\n', splitted[0], '\\n', splitted[1])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We need to split both the features and labels"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:19.372751Z",
"start_time": "2019-07-05T23:40:19.369328Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "9"
}
},
"outputs": [],
"source": [
"def split_batch(X, y, ctx_list):\n",
"    \"\"\"Shard features X and labels y across the devices in ctx_list.\n",
"\n",
"    Returns a pair (X_shards, y_shards), each produced by\n",
"    gluon.utils.split_and_load on the corresponding input.\n",
"    \"\"\"\n",
"    # Features and labels must describe the same number of examples\n",
"    assert X.shape[0] == y.shape[0]\n",
"    X_shards = gluon.utils.split_and_load(X, ctx_list)\n",
"    y_shards = gluon.utils.split_and_load(y, ctx_list)\n",
"    return X_shards, y_shards"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Multi-GPU training on a single mini-batch"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:19.379649Z",
"start_time": "2019-07-05T23:40:19.374330Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "10"
}
},
"outputs": [],
"source": [
"def train_batch(X, y, gpu_params, ctx_list, lr):\n",
"    \"\"\"Run one data-parallel SGD step on a single minibatch.\n",
"\n",
"    X, y: features and labels of the full minibatch.\n",
"    gpu_params: one list of parameters per device, in ctx_list order.\n",
"    ctx_list: devices the minibatch is split across.\n",
"    lr: learning rate passed to d2l.sgd.\n",
"    \"\"\"\n",
"    X_shards, y_shards = split_batch(X, y, ctx_list)\n",
"    # Forward pass: each device scores its own shard with its own replica\n",
"    with autograd.record():\n",
"        shard_losses = []\n",
"        for X_shard, y_shard, replica in zip(X_shards, y_shards, gpu_params):\n",
"            shard_losses.append(loss(lenet(X_shard, replica), y_shard))\n",
"    # Back propagation is performed separately on each device\n",
"    for shard_loss in shard_losses:\n",
"        shard_loss.backward()\n",
"    # Sum each parameter's gradient over all devices, then broadcast the sum\n",
"    num_devices = len(ctx_list)\n",
"    for param_idx in range(len(gpu_params[0])):\n",
"        grads = [gpu_params[dev][param_idx].grad for dev in range(num_devices)]\n",
"        allreduce(grads)\n",
"    # Each replica is updated on its own device; gradients were summed over\n",
"    # the whole minibatch, so the full batch size is passed to d2l.sgd\n",
"    for replica in gpu_params:\n",
"        d2l.sgd(replica, lr, X.shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Training function"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:19.387640Z",
"start_time": "2019-07-05T23:40:19.380970Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "61"
}
},
"outputs": [],
"source": [
"def train(num_gpus, batch_size, lr):\n",
"    \"\"\"Train LeNet on Fashion-MNIST with data parallelism and plot test accuracy.\n",
"\n",
"    num_gpus: number of devices, obtained via d2l.try_gpu(0..num_gpus-1).\n",
"    batch_size: minibatch size given to the data loader; each minibatch is\n",
"        split across the devices by train_batch.\n",
"    lr: learning rate forwarded to train_batch.\n",
"\n",
"    Prints the final test accuracy and the average time per epoch.\n",
"    \"\"\"\n",
"    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)\n",
"    ctx_list = [d2l.try_gpu(i) for i in range(num_gpus)]\n",
"    # Copy model parameters to num_gpus GPUs\n",
"    gpu_params = [get_params(params, c) for c in ctx_list]\n",
"    num_epochs = 10\n",
"    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])\n",
"    timer = d2l.Timer()\n",
"    for epoch in range(num_epochs):\n",
"        timer.start()\n",
"        for X, y in train_iter:\n",
"            train_batch(X, y, gpu_params, ctx_list, lr)\n",
"            # Wait for queued computation so the timer reflects the real work\n",
"            npx.waitall()\n",
"        timer.stop()\n",
"        # Evaluate with the replica held on the first training device. The\n",
"        # device comes from the local ctx_list, not from a notebook-level\n",
"        # variable, so this function does not depend on other cells' state.\n",
"        animator.add(epoch + 1, (d2l.evaluate_accuracy_gpu(\n",
"            lambda x: lenet(x, gpu_params[0]), test_iter, ctx_list[0]),))\n",
"    print('test acc: %.2f, %.1f sec/epoch on %s' % (\n",
"        animator.Y[0][-1], timer.avg(), ctx_list))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Training with a single GPU"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-07-05T23:40:43.841379Z",
"start_time": "2019-07-05T23:40:19.388943Z"
},
"attributes": {
"classes": [],
"id": "",
"n": "62"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test acc: 0.80, 1.8 sec/epoch on [gpu(0)]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
"