Caffe2 - C++ API
A deep learning, cross platform ML framework
transpose_op_cudnn.cc
1 #include "caffe2/core/context_gpu.h"
2 #include "caffe2/core/cudnn_wrappers.h"
3 #include "caffe2/core/types.h"
4 #include "caffe2/operators/transpose_op.h"
5 #include "caffe2/utils/math.h"
6 
7 namespace caffe2 {
8 
9 #define MAX_DIMS 8
10 
11 class CuDNNTransposeOp final : public Operator<CUDAContext> {
12  public:
13  USE_OPERATOR_FUNCTIONS(CUDAContext);
14  USE_DISPATCH_HELPER;
15 
16  CuDNNTransposeOp(const OperatorDef& operator_def, Workspace* ws)
17  : Operator<CUDAContext>(operator_def, ws),
18  cudnn_wrapper_(&context_),
19  axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
20  // We will check the legality of axes_: it should be from 0 to axes_.size().
21  std::vector<int> axes_sorted(axes_);
22  std::sort(axes_sorted.begin(), axes_sorted.end());
23  for (int i = 0; i < axes_sorted.size(); ++i) {
24  if (axes_sorted[i] != i) {
25  CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
26  }
27  }
28 
29  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
30  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
31  }
32 
33  ~CuDNNTransposeOp() {
34  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
35  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
36  }
37 
38  bool RunOnDevice() override {
39  const auto& X = Input(0);
40  auto* Y = Output(0);
41  const int num_axes = X.ndim();
42  const std::vector<int> x_dims(X.dims().cbegin(), X.dims().cend());
43  std::vector<int> y_dims(num_axes);
44  if (axes_.empty()) {
45  axes_.resize(num_axes);
46  for (int i = 0; i < num_axes; ++i) {
47  axes_[i] = num_axes - 1 - i;
48  }
49  y_dims.assign(X.dims().rbegin(), X.dims().rend());
50  } else {
51  CAFFE_ENFORCE_EQ(X.ndim(), axes_.size());
52  for (int i = 0; i < num_axes; ++i) {
53  y_dims[i] = X.dim32(axes_[i]);
54  }
55  }
56  Y->Resize(y_dims);
57  SetDeviceTensor(x_dims, &x_dims_device_);
58  SetDeviceTensor(y_dims, &y_dims_device_);
59  SetDeviceTensor(axes_, &axes_device_);
60  // Do the actual transpose, which is implemented in DoRunWithType().
61 #if CUDNN_VERSION_MIN(6, 0, 0)
62  return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
63 #else
64  // CUDNN 5.1 does not have int support yet.
65  return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
66 #endif
67  }
68 
69  protected:
70  void SetDeviceTensor(
71  const std::vector<int>& data,
72  Tensor<CUDAContext>* tensor) {
73  tensor->Resize(data.size());
74  context_.template Copy<int, CPUContext, CUDAContext>(
75  data.size(), data.data(), tensor->template mutable_data<int>());
76  }
77 
78  template <typename T>
79  bool DoRunWithType() {
80  const auto& input = Input(0);
81  auto* output = Output(0);
82  int ndim = input.ndim();
83 
84  if (ndim == 0) {
85  return true;
86  }
87  if (ndim == 1) {
88  output->CopyFrom(input);
89  return true;
90  }
91 
92  cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
93 #if CUDNN_VERSION_MIN(6, 0, 0)
94  if (typedesc == CUDNN_DATA_INT32) {
95  // CUDNN Transpose only support float for now
96  math::Transpose<int, CUDAContext>(
97  axes_.size(),
98  x_dims_device_.template data<int>(),
99  y_dims_device_.template data<int>(),
100  axes_device_.template data<int>(),
101  input.size(),
102  input.template data<int>(),
103  output->template mutable_data<int>(),
104  &context_);
105  return true;
106  }
107 #endif
108 
109  CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
110 
111  stride_y[ndim - 1] = 1;
112  for (int i = ndim - 2; i >= 0; i--) {
113  stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
114  }
115 
116  CHECK(axes_.size() >= ndim);
117 
118  stride_x[ndim] = 1;
119  for (int i = 0; i < ndim; i++) {
120  stride_x[i] = 1;
121  for (int j = axes_[i] + 1; j < ndim; j++) {
122  stride_x[i] *= input.dim32(j);
123  }
124  dim_y_int[i] = output->dim32(i);
125  }
126 
127  // CuDNN requires at least 3-dim tensors
128  for (int i = ndim; i < MAX_DIMS; i++) {
129  stride_x[i] = 1;
130  stride_y[i] = 1;
131  dim_y_int[i] = 1;
132  }
133 
134  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
135  xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
136 
137  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
138  yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
139 
140  CUDNN_ENFORCE(cudnnTransformTensor(
141  cudnn_wrapper_.inline_cudnn_handle(),
143  xDesc_,
144  static_cast<const void*>(input.template data<T>()),
146  yDesc_,
147  static_cast<void*>(output->template mutable_data<T>())));
148  return true;
149  }
150 
151  int stride_x[MAX_DIMS];
152  int stride_y[MAX_DIMS];
153  int dim_y_int[MAX_DIMS];
154 
155  cudnnTensorDescriptor_t xDesc_;
156  cudnnTensorDescriptor_t yDesc_;
157  CuDNNWrapper cudnn_wrapper_;
158 
159  std::vector<int> axes_;
160 
161  Tensor<CUDAContext> x_dims_device_;
162  Tensor<CUDAContext> y_dims_device_;
163  Tensor<CUDAContext> axes_device_;
164 };
165 
166 REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
167 
168 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
void Resize(Ts...dim_source)
Resizes a tensor.
Definition: tensor.h:288
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
cudnnHandle_t inline_cudnn_handle()
Returns the inline cudnn handle that executes on the current thread&#39;s cuda_stream.
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...
Definition: common_cudnn.h:111