// NOTE(review): this chunk is a garbled extraction — the original file's line
// numbers (1, 2, 3, ... 18, 19, ...) are fused into the text and many original
// lines are missing (6-17, 20, 26-28, 31-33). Code is kept byte-identical;
// only comments are added.
//
// Headers: GPU context, cuDNN wrapper, type helpers, the CPU transpose op
// this specializes, and math utilities (math::Transpose is used below).
// The trailing fragment is the constructor's member-initializer list:
// cudnn_wrapper_ is bound to this op's CUDA context.
1 #include "caffe2/core/context_gpu.h" 2 #include "caffe2/core/cudnn_wrappers.h" 3 #include "caffe2/core/types.h" 4 #include "caffe2/operators/transpose_op.h" 5 #include "caffe2/utils/math.h" 18 cudnn_wrapper_(&context_),
// axes_ is read from the operator's repeated integer argument "axes"
// (the requested output-axis permutation; may be empty — see RunOnDevice).
19 axes_(OperatorBase::GetRepeatedArgument<int>(
"axes")) {
// Validate that axes_ is a permutation of [0, ndim): a sorted copy must be
// exactly 0, 1, ..., n-1. (An empty axes_ passes trivially.)
21 std::vector<int> axes_sorted(axes_);
22 std::sort(axes_sorted.begin(), axes_sorted.end());
23 for (
int i = 0; i < axes_sorted.size(); ++i) {
24 if (axes_sorted[i] != i) {
25 CAFFE_THROW(
"Axes should be a permutation of 0 to ndim.");
// Constructor tail (original lines 26-28 missing here): create the cuDNN
// tensor descriptors for the input (xDesc_) and output (yDesc_) once;
// they are configured per-run in DoRunWithType.
29 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
30 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
// Destructor fragment (signature on missing original line ~33): release the
// descriptors created in the constructor — RAII pairing for the two
// cudnnCreateTensorDescriptor calls above.
34 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
35 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
// RunOnDevice: per-call setup for the transpose. Computes the output shape
// from the input shape and the axes permutation, then stages the dims/axes
// on the device for the kernel used in DoRunWithType.
// NOTE(review): original lines 40, 43-44, 48, 50, 54-56, 60 are missing from
// this extraction; the branch structure around the two loops below is
// inferred — TODO confirm against the full source.
38 bool RunOnDevice()
override {
39 const auto& X = Input(0);
// Snapshot input rank and dims as 32-bit ints (cuDNN/math APIs take int).
41 const int num_axes = X.ndim();
42 const std::vector<int> x_dims(X.dims().cbegin(), X.dims().cend());
43 std::vector<int> y_dims(num_axes);
// Presumably this branch runs when axes_ is empty (missing line ~44):
// default to reversing all dimensions, i.e. a full transpose.
45 axes_.resize(num_axes);
46 for (
int i = 0; i < num_axes; ++i) {
47 axes_[i] = num_axes - 1 - i;
// Reversed dims give the output shape for the default permutation.
49 y_dims.assign(X.dims().rbegin(), X.dims().rend());
// Otherwise (explicit axes): the permutation length must match the input
// rank, and output dim i is input dim axes_[i].
51 CAFFE_ENFORCE_EQ(X.ndim(), axes_.size());
52 for (
int i = 0; i < num_axes; ++i) {
53 y_dims[i] = X.dim32(axes_[i]);
// Copy the host-side dims/axes vectors into device tensors so the
// math::Transpose kernel (int path in DoRunWithType) can read them.
57 SetDeviceTensor(x_dims, &x_dims_device_);
58 SetDeviceTensor(y_dims, &y_dims_device_);
59 SetDeviceTensor(axes_, &axes_device_);
// SetDeviceTensor helper (signature partially missing — original line ~70 is
// absent; only the `data` parameter is visible). Copies a host vector<int>
// into a device-resident tensor: resize to element count, then a
// CPU -> CUDA copy of the raw ints.
// NOTE(review): the leading `#if CUDNN_VERSION_MIN(6, 0, 0)` fragment
// (original line 61) belongs to the tail of RunOnDevice, fused here by the
// extraction — it guards a dispatch path, not this helper.
61 #if CUDNN_VERSION_MIN(6, 0, 0) 71 const std::vector<int>& data,
73 tensor->
Resize(data.size());
// Copy<int, CPUContext, CUDAContext>: source is host memory, destination is
// the tensor's device buffer.
74 context_.template Copy<int, CPUContext, CUDAContext>(
75 data.size(), data.data(), tensor->template mutable_data<int>());
// DoRunWithType<T>: performs the actual transpose on the GPU. Three paths
// are visible in this (gappy) extraction:
//   1. a trivial path that just copies input to output (original lines
//      83-87 are missing — presumably guarded by an identity-permutation
//      or rank check; TODO confirm),
//   2. for cuDNN >= 6 and int32 data, a custom math::Transpose kernel
//      (cudnnTransformTensor is not used for this dtype here),
//   3. a fallback that drives cudnnTransformTensor with hand-built strides.
79 bool DoRunWithType() {
80 const auto& input = Input(0);
81 auto* output = Output(0);
82 int ndim = input.ndim();
// Trivial path: output is a straight copy of input (no reordering needed).
88 output->CopyFrom(input);
// Int path (cuDNN >= 6 exposes CUDNN_DATA_INT32): use the device-staged
// dims/axes from RunOnDevice and the generic math::Transpose kernel.
93 #if CUDNN_VERSION_MIN(6, 0, 0) 94 if (typedesc == CUDNN_DATA_INT32) {
96 math::Transpose<int, CUDAContext>(
98 x_dims_device_.template data<int>(),
99 y_dims_device_.template data<int>(),
100 axes_device_.template data<int>(),
102 input.template data<int>(),
103 output->template mutable_data<int>(),
// Fallback cuDNN path: descriptor-based transform. The fixed-size stride
// scratch arrays bound the supported rank.
109 CAFFE_ENFORCE(ndim < MAX_DIMS,
"Input ndim exceeds compile time max.");
// stride_y: ordinary row-major (C-contiguous) strides of the output shape,
// built innermost-out.
111 stride_y[ndim - 1] = 1;
112 for (
int i = ndim - 2; i >= 0; i--) {
113 stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
114 }
// axes_ was sized/validated in RunOnDevice; it must cover every dim here.
116 CHECK(axes_.size() >= ndim);
// stride_x[i]: the input's row-major stride of dimension axes_[i], i.e. the
// product of input dims after axes_[i]. Reading the input with these
// permuted strides yields the transposed element order.
119 for (
int i = 0; i < ndim; i++) {
121 for (
int j = axes_[i] + 1; j < ndim; j++) {
122 stride_x[i] *= input.dim32(j);
// dim_y_int mirrors the output dims for the Nd descriptors below.
124 dim_y_int[i] = output->dim32(i);
// Pad trailing entries up to MAX_DIMS (bodies on missing original lines
// 129-133) — cuDNN Nd descriptors require at least 4 dimensions, hence the
// `ndim < 4 ? 4 : ndim` below; presumably padded with dim 1 / stride 1.
128 for (
int i = ndim; i < MAX_DIMS; i++) {
// Both descriptors use the OUTPUT dim order (dim_y_int); only the strides
// differ: permuted input strides for xDesc_, contiguous for yDesc_. This
// makes cudnnTransformTensor's copy realize the permutation.
134 CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
135 xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
137 CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
138 yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
// The transform itself (alpha/beta scaling args are on missing original
// lines 141-143): y = transform(x) under the two descriptors above.
140 CUDNN_ENFORCE(cudnnTransformTensor(
144 static_cast<const void*
>(input.template data<T>()),
147 static_cast<void*
>(output->template mutable_data<T>())));
// Member state (class header not visible in this extraction):
// Scratch stride/dim arrays reused by DoRunWithType each run; MAX_DIMS is
// the compile-time rank limit enforced there.
151 int stride_x[MAX_DIMS];
152 int stride_y[MAX_DIMS];
153 int dim_y_int[MAX_DIMS];
// cuDNN tensor descriptors, created in the constructor and destroyed in the
// destructor; reconfigured per run.
155 cudnnTensorDescriptor_t xDesc_;
156 cudnnTensorDescriptor_t yDesc_;
// Requested axis permutation from the "axes" argument (empty => reverse-all
// default chosen in RunOnDevice).
159 std::vector<int> axes_;
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
void Resize(Ts... dim_source)
Resizes a tensor to the given dimensions.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
cudnnHandle_t inline_cudnn_handle()
Returns the inline cudnn handle that executes on the current thread's cuda_stream.
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...