1 #include "caffe2/core/common_gpu.h" 2 #include "caffe2/core/context_gpu.h" 3 #include "caffe2/operators/fully_connected_op.h" 9 constexpr
// Threshold compared against `prop.major` in the dispatch helpers of this
// file: FP16 computation is only selected when prop.major >= this value;
// otherwise the helpers log a fallback to FP32.
// NOTE(review): `prop` is declared on a line missing from this listing --
// presumably a cudaDeviceProp, making this a minimum compute-capability
// major version; confirm against the complete file.
int kFp16CUDADevicePropMajor = 6;
// Runs a fully connected operator by selecting a DoRunWithType instantiation
// from the element type of op->Input(0).
//
// NOTE(review): this listing has lines missing (the template arguments of
// every DoRunWithType call, the declaration of `prop`, and several closing
// braces / else keywords are not visible). The comments below describe only
// what the visible lines show; confirm details against the complete file.
//
// Params:
//   float16_compute - in the visible lines, consulted only on the float16
//                     input path, where it gates the device check against
//                     kFp16CUDADevicePropMajor.
//   op              - operator whose Input(0) type is inspected and whose
//                     DoRunWithType<...>() is invoked.
// Returns: the result of the selected op->DoRunWithType<...>() call.
// Throws: via CAFFE_THROW("Unsupported type") -- presumably when Input(0) is
//         neither float nor float16 (the enclosing else is not visible).
11 template <
class FullyConnectedOp>
12 bool RunFullyConnectedOpOnCUDADevice(
13 const bool float16_compute,
14 FullyConnectedOp* op) {
// float input path.
15 if (op->Input(0).template IsType<float>()) {
16 return op->template DoRunWithType<
22 }
// float16 input path.
else if (op->Input(0).template IsType<float16>()) {
23 if (float16_compute) {
// `prop` is declared on a line missing from this excerpt -- presumably the
// current device's cudaDeviceProp; TODO confirm.
25 if (prop.major >= kFp16CUDADevicePropMajor) {
26 return op->template DoRunWithType<
// Presumably the else-branch of the device check above: FP16 computation was
// requested but is not used, so the fallback is logged before dispatching.
33 LOG(INFO) <<
"CUDA Device does not support FP16 computation, " 34 "falling back to FP32.";
35 return op->template DoRunWithType<
// Presumably the float16_compute == false path (enclosing else not visible).
43 return op->template DoRunWithType<
51 CAFFE_THROW(
"Unsupported type");
// Runs a fully connected gradient operator by selecting a DoRunWithType
// instantiation from the element type of op->Input(0). The visible control
// flow matches RunFullyConnectedOpOnCUDADevice line for line.
//
// NOTE(review): this listing has lines missing (the template arguments of
// every DoRunWithType call, the declaration of `prop`, and several closing
// braces / else keywords are not visible). The comments below describe only
// what the visible lines show; confirm details against the complete file.
//
// Params:
//   float16_compute - in the visible lines, consulted only on the float16
//                     input path, where it gates the device check against
//                     kFp16CUDADevicePropMajor.
//   op              - gradient operator whose Input(0) type is inspected and
//                     whose DoRunWithType<...>() is invoked.
// Returns: the result of the selected op->DoRunWithType<...>() call.
// Throws: via CAFFE_THROW("Unsupported type") -- presumably when Input(0) is
//         neither float nor float16 (the enclosing else is not visible).
56 template <
class FullyConnectedGradientOp>
57 bool RunFullyConnectedGradientOpOnCUDADevice(
58 const bool float16_compute,
59 FullyConnectedGradientOp* op) {
// float input path.
60 if (op->Input(0).template IsType<float>()) {
61 return op->template DoRunWithType<
70 }
// float16 input path.
else if (op->Input(0).template IsType<float16>()) {
71 if (float16_compute) {
// `prop` is declared on a line missing from this excerpt -- presumably the
// current device's cudaDeviceProp; TODO confirm.
73 if (prop.major >= kFp16CUDADevicePropMajor) {
74 return op->template DoRunWithType<
// Presumably the else-branch of the device check above: FP16 computation was
// requested but is not used, so the fallback is logged before dispatching.
84 LOG(INFO) <<
"CUDA Device does not support FP16 computation, " 85 "falling back to FP32.";
86 return op->template DoRunWithType<
// Presumably the float16_compute == false path (enclosing else not visible).
97 return op->template DoRunWithType<
108 CAFFE_THROW(
"Unsupported type");
// RunOnDevice for FullyConnectedOp<CUDAContext>: forwards this operator and
// its float16_compute_ member to RunFullyConnectedOpOnCUDADevice and returns
// that helper's result.
118 bool FullyConnectedOp<CUDAContext>::RunOnDevice() {
119 return RunFullyConnectedOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for a second FullyConnectedOp specialization whose last
// template argument is `false`; it forwards float16_compute_ and `this` to
// RunFullyConnectedOpOnCUDADevice exactly like the specialization above.
// NOTE(review): the leading template arguments are on lines missing from this
// listing -- presumably the transposed-weight variant registered further
// down; confirm against the complete file.
123 bool FullyConnectedOp<
126 false >::RunOnDevice() {
127 return RunFullyConnectedOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for FullyConnectedGradientOp<CUDAContext>: forwards this
// operator and its float16_compute_ member to
// RunFullyConnectedGradientOpOnCUDADevice and returns that helper's result.
131 bool FullyConnectedGradientOp<CUDAContext>::RunOnDevice() {
132 return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for a second FullyConnectedGradientOp specialization whose last
// template argument is `false`; it forwards float16_compute_ and `this` to
// RunFullyConnectedGradientOpOnCUDADevice.
// NOTE(review): the leading template arguments are on lines missing from this
// listing -- presumably the variant registered as FCTransposedGradient
// further down; confirm against the complete file.
136 bool FullyConnectedGradientOp<
139 false >::RunOnDevice() {
140 return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_,
this);
// Start of the section guarded by CUDA_VERSION >= 9000.
// RunOnDevice for FullyConnectedOp<CUDAContext, TensorCoreEngine>: unlike the
// default-engine specializations, it passes a literal `false` for
// float16_compute instead of the float16_compute_ member.
143 #if CUDA_VERSION >= 9000 150 bool FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
151 return RunFullyConnectedOpOnCUDADevice(
false ,
this);
// RunOnDevice for another FullyConnectedOp specialization whose last template
// argument is `false`; it also passes a literal `false` for float16_compute.
// NOTE(review): the leading template arguments are on lines missing from this
// listing -- presumably the TensorCoreEngine counterpart of the
// transposed-weight variant; confirm against the complete file.
155 bool FullyConnectedOp<
158 false >::RunOnDevice() {
159 return RunFullyConnectedOpOnCUDADevice(
false ,
this);
// RunOnDevice for FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>:
// returns the result of RunFullyConnectedGradientOpOnCUDADevice.
// NOTE(review): the call's arguments are on lines missing from this listing
// -- presumably (false, this) as in the forward TensorCoreEngine
// specializations; confirm against the complete file.
163 bool FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
164 return RunFullyConnectedGradientOpOnCUDADevice(
// RunOnDevice for another FullyConnectedGradientOp specialization whose last
// template argument is `false`: returns the result of
// RunFullyConnectedGradientOpOnCUDADevice.
// NOTE(review): the leading template arguments and the call's arguments are
// on lines missing from this listing; confirm against the complete file.
169 bool FullyConnectedGradientOp<
172 false >::RunOnDevice() {
173 return RunFullyConnectedGradientOpOnCUDADevice(
// CUDA operator registrations (default engine).
// FC and FCGradient are registered with the CUDAContext specializations of
// FullyConnectedOp and FullyConnectedGradientOp respectively.
179 REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<CUDAContext>);
180 REGISTER_CUDA_OPERATOR(FCGradient, FullyConnectedGradientOp<CUDAContext>);
// NOTE(review): the arguments of the next registration are on lines missing
// from this listing -- presumably the forward transposed-weight operator;
// confirm against the complete file.
182 REGISTER_CUDA_OPERATOR(
// FCTransposedGradient is registered with a FullyConnectedGradientOp
// specialization whose template arguments are not visible here.
188 REGISTER_CUDA_OPERATOR(
189 FCTransposedGradient,
190 FullyConnectedGradientOp<
// Engine-specific CUDA operator registrations, guarded by
// CUDA_VERSION >= 9000. The visible operator types are the TensorCoreEngine
// specializations of FullyConnectedOp and FullyConnectedGradientOp.
// NOTE(review): the operator-name and engine-name arguments of most of these
// macro calls are on lines missing from this listing; only
// FCTransposedGradient is visible as a name. Confirm against the complete
// file.
195 #if CUDA_VERSION >= 9000 196 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
199 FullyConnectedOp<CUDAContext, TensorCoreEngine>);
200 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
203 FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>);
205 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
212 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
213 FCTransposedGradient,
215 FullyConnectedGradientOp<
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules.
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.