"""muji.py does multi-gpu training for caffe2 with no need to change the c++
side code. Everything is defined on the computation graph level.

Currently, here are the assumptions: we only support the following use cases:
  - 2 gpus, where peer access is enabled between them.
  - 4 gpus, where peer access is enabled between all of them.
  - 8 gpus, where peer access is enabled in two groups,
    between {1, 2, 3, 4} and {5, 6, 7, 8}.
"""

from caffe2.proto import caffe2_pb2
def OnGPU(gpu_id):
    """Build a device option protobuf that selects the given CUDA gpu.

    Args:
        gpu_id: value stored in the option's ``cuda_gpu_id`` field.

    Returns:
        A ``caffe2_pb2.DeviceOption`` whose ``device_type`` is
        ``caffe2_pb2.CUDA`` and whose ``cuda_gpu_id`` is ``gpu_id``.
    """
    # NOTE(review): the def line and the return statement were missing from
    # the damaged listing. The name and the returned value are inferred from
    # the ``device_option=OnGPU(...)`` call sites in this file -- confirm.
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = gpu_id
    return device_option
27 device_option = caffe2_pb2.DeviceOption()
28 device_option.device_type = caffe2_pb2.CPU
def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
    """The general Allreduce interface that reroutes the function calls.

    Picks an implementation from the number of blobs: Allreduce2 for 2,
    Allreduce4 for 4, Allreduce8 for 8, and AllreduceFallback otherwise.

    Args:
        net: passed through unchanged to the selected implementation.
        blobs: sequence of blobs to reduce, one per gpu.
        reduced_affix: suffix the implementations append to a blob name to
            name its reduced output.
        gpu_indices: gpu id for each blob, in the same order as ``blobs``.
            Defaults to ``0 .. len(blobs) - 1``.

    Returns:
        Whatever the selected implementation returns.

    Raises:
        RuntimeError: if ``gpu_indices`` and ``blobs`` differ in length.
    """
    if gpu_indices is None:
        gpu_indices = list(range(len(blobs)))
    if len(gpu_indices) != len(blobs):
        # NOTE(review): the ``raise`` line was missing from the damaged
        # listing; RuntimeError is assumed -- confirm against upstream.
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs))
        )
    # NOTE(review): the branch conditions were also missing; they are
    # restored from the callee names (one specialised routine per gpu count).
    num_blobs = len(blobs)
    if num_blobs == 2:
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    elif num_blobs == 4:
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    elif num_blobs == 8:
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    else:
        return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)
def Allreduce2(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 2 gpus.

    Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced"

    Args:
        net: object whose ``Add`` method is called to emit the sum.
        blobs: exactly two blobs, one per gpu.
        reduced_affix: suffix appended to each blob name to name its
            reduced output.
        gpu_indices: exactly two gpu ids, matching ``blobs`` in order.

    Returns:
        A tuple ``(a_reduced, b_reduced)``: the result of the ``Add`` call
        and the result of the ``Copy`` call made on it.

    Raises:
        ValueError: from tuple unpacking if ``blobs`` or ``gpu_indices``
            does not hold exactly two items.
    """
    # NOTE(review): the blob unpacking and the first two Copy arguments were
    # missing from the damaged listing; they follow the pattern visible in
    # Allreduce4 -- confirm against upstream.
    a, b = blobs
    gpu_a, gpu_b = gpu_indices
    # str() keeps output naming consistent with Allreduce4, which also
    # builds names as str(blob) + affix.
    a_reduced = net.Add(
        [a, b],
        str(a) + reduced_affix,
        device_option=OnGPU(gpu_a)
    )
    b_reduced = a_reduced.Copy(
        [],
        str(b) + reduced_affix,
        device_option=OnGPU(gpu_b)
    )
    return a_reduced, b_reduced
def Allreduce4(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 4 gpus.

    Algorithm: 2 level reduction.
        0r <- 0 + 1, 2r <- 2 + 3
        0r <- 0r + 2r
        2r <- 0r,
        1r <- 0r, 3r <- 2r

    Args:
        net: object whose ``Add`` method is called to emit the pair sums.
        blobs: exactly four blobs, one per gpu.
        reduced_affix: suffix appended to each blob name to name its
            reduced output.
        gpu_indices: exactly four gpu ids, matching ``blobs`` in order.

    Returns:
        A tuple ``(a_reduced, b_reduced, c_reduced, d_reduced)``.

    Raises:
        ValueError: from tuple unpacking if ``blobs`` or ``gpu_indices``
            does not hold exactly four items.
    """
    # NOTE(review): the blob unpacking, the two ``net.Add(`` openers with
    # their input lists, and the ``[]`` Copy arguments were missing from the
    # damaged listing; they are restored from the docstring and the
    # surviving lines -- confirm against upstream.
    a, b, c, d = blobs
    gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices
    # a_reduced <- a + b, c_reduced <- c + d
    a_reduced = net.Add(
        [a, b],
        str(a) + reduced_affix,
        device_option=OnGPU(gpu_a)
    )
    c_reduced = net.Add(
        [c, d],
        str(c) + reduced_affix,
        device_option=OnGPU(gpu_c)
    )
    # a_reduced <- a_reduced + c_reduced, written back over a_reduced
    a_reduced = a_reduced.Add(c_reduced, a_reduced, device_option=OnGPU(gpu_a))
    # broadcast a_reduced to c_reduced
    c_reduced = a_reduced.Copy([], c_reduced, device_option=OnGPU(gpu_c))
    # broadcast to b and d
    b_reduced = a_reduced.Copy(
        [],
        str(b) + reduced_affix,
        device_option=OnGPU(gpu_b)
    )
    d_reduced = c_reduced.Copy(
        [],
        str(d) + reduced_affix,
        device_option=OnGPU(gpu_d)
    )
    return a_reduced, b_reduced, c_reduced, d_reduced
def Allreduce8(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 8 gpus.

    Algorithm: 3 level reduction.
        0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7
        0r <- 0r + 2r, 4r <- 4r + 6r
        0r <- 0r + 4r
        4r <- 0r
        2r <- 0r, 6r <- 4r
        1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r

    Args:
        net: object whose ``Add`` method is called to emit the sums.
        blobs: eight blobs, one per gpu (indices 0..7 are read).
        reduced_affix: suffix appended to each blob name to name its
            reduced output.
        gpu_indices: eight gpu ids, matching ``blobs`` in order.

    Returns:
        A list of 8 entries; entry ``i`` is the result of the last op that
        wrote the reduced blob for position ``i``.
    """
    # NOTE(review): the ``reduced`` initialisation, the level-2 and
    # broadcast-level-2 loop headers, several call arguments and the final
    # return were missing from the damaged listing. They are restored from
    # the docstring, the index arithmetic on the surviving lines and the
    # matching steps in Allreduce4 -- confirm against upstream.
    reduced = [None] * 8
    # Reduction level 1: pairwise sums land on the even positions.
    for i in (0, 2, 4, 6):
        reduced[i] = net.Add(
            [blobs[i], blobs[i + 1]],
            # str() for consistency with the level-2 naming below.
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 2: fold position i + 2 into position i.
    for i in (0, 4):
        reduced[i] = net.Add(
            [reduced[i], reduced[i + 2]],
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 3: position 4 is first copied to position 0's gpu and
    # the sum is then taken there.
    reduced_4_copy = reduced[4].Copy(
        [],
        str(reduced[4]) + '_copy',
        device_option=OnGPU(gpu_indices[0])
    )
    reduced[0] = reduced[0].Add(
        reduced_4_copy,
        reduced[0],
        device_option=OnGPU(gpu_indices[0])
    )
    # Broadcast level 1: the total goes back to position 4.
    reduced[4] = reduced[0].Copy(
        [],
        reduced[4],
        device_option=OnGPU(gpu_indices[4])
    )
    # Broadcast level 2: positions 2 and 6 receive from 0 and 4.
    for i in (2, 6):
        reduced[i] = reduced[i - 2].Copy(
            [],
            reduced[i],
            device_option=OnGPU(gpu_indices[i])
        )
    # Broadcast level 3: each odd position receives from its even neighbour.
    for i in (1, 3, 5, 7):
        reduced[i] = reduced[i - 1].Copy(
            [],
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced
def AllreduceFallback(net, blobs, reduced_affix, gpu_indices):
    """A fallback option for Allreduce with no assumption on p2p.

    Algorithm: a flat operation on gpu 0
        0r <- 0
        0r <- 0r + i for i in gpu_indices[1:]
        ir <- 0r for i in gpu_indices[1:]

    Args:
        net: object whose ``Copy`` method is called to emit every copy.
        blobs: blobs to reduce, one per entry of ``gpu_indices``.
        reduced_affix: suffix appended to each blob name to name its
            reduced output.
        gpu_indices: gpu id for each blob, in the same order as ``blobs``.

    Returns:
        A list with one entry per gpu index; entry ``i`` is the result of
        the last op that wrote the reduced blob for position ``i``. An empty
        list is returned when ``gpu_indices`` is empty.
    """
    # Nothing to reduce: return early instead of failing on blobs[0].
    if not gpu_indices:
        return []
    # NOTE(review): several call arguments, the temp-copy statement opener
    # and the final return were missing from the damaged listing. They are
    # restored from the docstring and the surviving lines -- confirm against
    # upstream.
    reduced = [None] * len(gpu_indices)
    # copy first
    reduced[0] = net.Copy(
        blobs[0],
        str(blobs[0]) + reduced_affix,
        device_option=OnGPU(gpu_indices[0])
    )
    # do temp copy and add; the same temp name is reused on every iteration
    temp_name = str(reduced[0]) + '_temp_copy'
    for i in range(1, len(gpu_indices)):
        temp = net.Copy(
            blobs[i],
            temp_name,
            device_option=OnGPU(gpu_indices[0])
        )
        reduced[0] = reduced[0].Add(
            temp,
            reduced[0],
            device_option=OnGPU(gpu_indices[0])
        )
    # Broadcast to everyone else
    for i in range(1, len(gpu_indices)):
        reduced[i] = net.Copy(
            reduced[0],
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced