// Caffe2 - C++ API: a deep learning, cross-platform ML framework.
// File: fully_connected_op_gpu.cc
1 #include "caffe2/core/common_gpu.h"
2 #include "caffe2/core/context_gpu.h"
3 #include "caffe2/operators/fully_connected_op.h"
4 
5 namespace caffe2 {
6 
7 namespace {
8 
9 constexpr int kFp16CUDADevicePropMajor = 6;
10 
// Dispatches the FC forward pass to the DoRunWithType<> instantiation that
// matches the input dtype:
//   - float inputs always use float math;
//   - float16 inputs use float16 math only when `float16_compute` is
//     requested AND the current CUDA device has compute capability major
//     >= kFp16CUDADevicePropMajor; otherwise tensors stay float16 but the
//     math (accumulation) type falls back to float.
// Throws via CAFFE_THROW on any other input type; otherwise returns the
// result of DoRunWithType.
template <class FullyConnectedOp>
bool RunFullyConnectedOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // B
        float, // Y
        float>(); // Math
  } else if (op->Input(0).template IsType<float16>()) {
    bool use_fp16_math = false;
    if (float16_compute) {
      // Query the device this op actually runs on. Querying a hard-coded
      // device 0 would give a wrong fp16-capability answer on heterogeneous
      // multi-GPU machines.
      const cudaDeviceProp& prop = GetDeviceProperty(CaffeCudaGetDevice());
      use_fp16_math = prop.major >= kFp16CUDADevicePropMajor;
      if (!use_fp16_math) {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
      }
    }
    if (use_fp16_math) {
      return op->template DoRunWithType<
          float16, // X
          float16, // W
          float16, // B
          float16, // Y
          float16>(); // Math
    }
    // FP16 storage with FP32 math — used both when fp16 compute was not
    // requested and when the device cannot do it.
    return op->template DoRunWithType<
        float16, // X
        float16, // W
        float16, // B
        float16, // Y
        float>(); // Math
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false; // Unreachable (CAFFE_THROW throws); silences compilers.
}
55 
// Dispatches the FC backward pass to the DoRunWithType<> instantiation that
// matches the input dtype. Same dtype policy as the forward helper:
//   - float inputs always use float math;
//   - float16 inputs use float16 math only when `float16_compute` is
//     requested AND the current device's compute capability major is
//     >= kFp16CUDADevicePropMajor; otherwise fp16 storage with fp32 math.
// Throws via CAFFE_THROW on any other input type.
template <class FullyConnectedGradientOp>
bool RunFullyConnectedGradientOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedGradientOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // dY
        float, // B
        float, // dX
        float, // dW
        float, // dB
        float>(); // Math
  } else if (op->Input(0).template IsType<float16>()) {
    bool use_fp16_math = false;
    if (float16_compute) {
      // Query the device this op actually runs on, not a hard-coded
      // device 0, so the fp16-capability check is correct on heterogeneous
      // multi-GPU machines.
      const cudaDeviceProp& prop = GetDeviceProperty(CaffeCudaGetDevice());
      use_fp16_math = prop.major >= kFp16CUDADevicePropMajor;
      if (!use_fp16_math) {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
      }
    }
    if (use_fp16_math) {
      return op->template DoRunWithType<
          float16, // X
          float16, // W
          float16, // dY
          float16, // B
          float16, // dX
          float16, // dW
          float16, // dB
          float16>(); // Math
    }
    // FP16 storage with FP32 math — used both when fp16 compute was not
    // requested and when the device cannot do it.
    return op->template DoRunWithType<
        float16, // X
        float16, // W
        float16, // dY
        float16, // B
        float16, // dX
        float16, // dW
        float16, // dB
        float>(); // Math
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false; // Unreachable (CAFFE_THROW throws); silences compilers.
}
112 
113 } // namespace
114 
// RunOnDevice for the default CUDA FC op: forwards `this` to the shared
// dispatch helper, which selects the DoRunWithType<> instantiation from the
// input dtype; float16_compute_ requests fp16 math for float16 inputs.
template <>
bool FullyConnectedOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}
121 
// RunOnDevice for the FCTransposed variant (weight already transposed, so
// TransposeWeight == false); same dtype dispatch as the default FC op.
template <>
bool FullyConnectedOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}
129 
// RunOnDevice for the default CUDA FC gradient op: forwards to the shared
// gradient dispatch helper with this op's float16_compute_ setting.
template <>
bool FullyConnectedGradientOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}
134 
// RunOnDevice for the FCTransposedGradient variant (TransposeWeight ==
// false); same dtype dispatch as the default FC gradient op.
template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}
142 
143 #if CUDA_VERSION >= 9000
144 
145 // Require these to be defined otherwise TensorCore FC ops will end
146 // up calling the default FC implementation which doesn't have
147 // fp16 support...
148 
// TensorCore-engine FC forward. float16_compute is passed as false: the
// tensor-core path keeps fp32 math for float16 tensors rather than the
// fp16-math route taken by the default engine.
template <>
bool FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}
153 
// TensorCore-engine FCTransposed forward (TransposeWeight == false);
// float16_compute is forced off, matching the other tensor-core variants.
template <>
bool FullyConnectedOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}
161 
// TensorCore-engine FC backward; float16_compute is forced off, matching
// the other tensor-core variants.
template <>
bool FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}
167 
// TensorCore-engine FCTransposedGradient backward (TransposeWeight ==
// false); float16_compute is forced off, matching the other tensor-core
// variants.
template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}
176 
177 #endif
178 
// Default-engine CUDA registrations for FC forward/backward.
REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(FCGradient, FullyConnectedGradientOp<CUDAContext>);

// FCTransposed expects the weight matrix pre-transposed, hence the
// TransposeWeight == false template argument.
REGISTER_CUDA_OPERATOR(
    FCTransposed,
    FullyConnectedOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR(
    FCTransposedGradient,
    FullyConnectedGradientOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);

// TensorCore variants, selected at runtime via the "TENSORCORE" engine
// name. Only compiled against CUDA 9.0+ toolkits.
#if CUDA_VERSION >= 9000
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FC,
    TENSORCORE,
    FullyConnectedOp<CUDAContext, TensorCoreEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCGradient,
    TENSORCORE,
    FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>);

REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposed,
    TENSORCORE,
    FullyConnectedOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposedGradient,
    TENSORCORE,
    FullyConnectedGradientOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
#endif
220 
221 } // namespace caffe2
// NOTE(review): the following lines are documentation-extraction residue,
// kept as a comment so the file remains compilable:
//   const cudaDeviceProp& GetDeviceProperty(const int deviceid)
//     Gets the device property for the given device.
//     (Defined in caffe2/core/common_gpu.cc:166.)