Caffe2 - C++ API
A deep learning, cross-platform ML framework
reducer_functors.h
#ifndef CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_
#define CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_

#include <array>

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

////////////////////////////////////////////////////////////////////////////
// Range reducers: can leverage that the input segment is continuous and
// provide a special implementation
////////////////////////////////////////////////////////////////////////////

// Put forward and backward in the same template?
template <typename T, class Context>
class SumRangeReducer;
template <typename T, class Context>
class SumRangeReducerGradient;

template <typename T>
class SumRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    // do we need to go through wrapper in math.h?
    EigenVectorMap<T> out_vec(out, block_size);
    out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
  }
};

template <typename T, class Context>
class SumRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad,
      T* data_grad,
      const T* /*data_in*/, // unused
      const T* /*data_out*/, // unused
      Context* context) {
    // do we have some op that does it smartly with minimum number of memcpy?
    for (TIndex i = 0; i < blocks; ++i) {
      context->template Copy<T, Context, Context>(
          block_size, segment_grad, data_grad + block_size * i);
    }
  }
};

struct SumRangeReducerDef {
  template <typename T, class Context>
  using Reducer = SumRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
};
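// Illustrative sketch, not part of the original header: how a range reducer
// and its gradient functor are invoked over one contiguous segment laid out
// as `blocks` blocks of `block_size` elements. Names such as `segment_in`,
// `segment_out`, `segment_out_grad`, `segment_in_grad` and `ctx` are
// hypothetical.
//
//   CPUContext ctx;
//   SumRangeReducer<float, CPUContext> forward;
//   forward(block_size, blocks, segment_in, segment_out, &ctx);
//
//   SumRangeReducerGradient<float, CPUContext> backward;
//   backward(block_size, blocks, segment_out_grad, segment_in_grad,
//            segment_in, segment_out, &ctx);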
// Put forward and backward in the same template?
template <typename T, class Context>
class LogSumExpRangeReducer;
template <typename T, class Context>
class LogSumExpRangeReducerGradient;

template <typename T>
class LogSumExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
  T r{1};
};

template <typename T, class Context>
class LogSumExpRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
      }
    }
  }
};

struct LogSumExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogSumExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogSumExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogSumExp";
  static constexpr const char* doc =
      "LogSumExp computes the element-wise log of the sum of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};
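// Illustration only, not part of the original header: the forward pass above
// relies on the usual max-shift identity for numerical stability,
//
//   log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m)),  where m = max_i x_i,
//
// so none of the exp() calls can overflow. The backward pass uses
//
//   d/dx_i log(sum_k exp(x_k)) = exp(x_i - out),
//
// where `out` is the forward result; this is exactly the
// `out_grad * std::exp(data_in[idx] - offset)` expression in the gradient
// functor, with `offset` holding the corresponding forward output.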
template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};
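// Illustration only, not part of the original header:
//   LogMeanExp(x) = LogSumExp(x) - log(blocks).
// Dividing `scaled_exp_sum` by `blocks` before taking the log implements that
// shift in the forward pass, and the extra `/ blocks` in the gradient functor
// accounts for it in the backward pass.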
template <typename T, class Context>
class MeanRangeReducer;
template <typename T, class Context>
class MeanRangeReducerGradient;

template <typename T>
class MeanRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T avg_value = 0;
      for (int i = 0; i < blocks; ++i) {
        avg_value += in[i * block_size + j] / blocks;
      }
      *(out++) = avg_value;
    }
  }
};

template <typename T, class Context>
class MeanRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* /*data_in*/, // I
      const T* /*data_out*/, // O
      Context* /*context*/) {
    const auto in_grad = 1.0 / blocks;
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * in_grad;
      }
    }
  }
};

struct MeanRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MeanRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};
template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
        }
      }
    }
  }
};

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of value. When computing the gradient using the "
      "backward propagation, the gradient input corresponding to the first "
      "occurrence of the maximum value will be used.";
};
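// Illustration only, with hypothetical values: for a segment with two blocks
// [1, 3] and [3, 2], the element-wise forward output is [3, 3]. On the
// backward pass the gradient buffer is first zeroed with memset, and the
// equality test in MaxRangeReducerGradient then routes each output gradient
// to every input position whose value equals the segment maximum for that
// element.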
////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////

// Base implementation, everything can be overwritten
class BaseReducer {
 public:
  static constexpr int kInputCount = 1;

  struct Meta {
    TIndex block_size;
    vector<TIndex> block_shape;
    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void computeMeta(const std::vector<TIndex>& dims, int skip_dims) {
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim ? size_from_dim_(skip_dims, dims)
                             : size_from_dim_(dims.size() - skip_dims, dims);
    }

    void
    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
      DCHECK_EQ(0, input);
      auto& dims = value.dims();
      computeMeta(dims, skip_dims);
    }

    void appendOutputShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }

    vector<TIndex> getOutputShape(const TensorShape& in, int skip_dims) {
      vector<TIndex> dims(in.dims().begin(), in.dims().end());
      computeMeta(dims, skip_dims);
      return block_shape;
    }
  };

  template <int FixedSize>
  void finish(const Meta& /*meta*/, CPUContext* /*context*/) {}
};
class BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 0> originalInputs() {
    return std::array<int, 0>();
  }

  static constexpr bool computeLength() {
    return false;
  }

  static int numAuxInputsWithGrads(const OperatorDef& /*def*/) {
    return 0;
  }

  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return false;
  }

  // True if the backward op requires the output of the forward op.
  static bool requiresForwardOutput() {
    return false;
  }

  struct Meta {
    TIndex block_size;
    vector<TIndex> block_shape;
    bool first_dim;

    Meta(
        const Tensor<CPUContext>& out_grad,
        int skip_dims,
        bool first_dim = true)
        : first_dim(first_dim) {
      auto& dims = out_grad.dims();
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim
          ? out_grad.size_from_dim(skip_dims)
          : out_grad.size_from_dim(out_grad.ndim() - skip_dims);
    }

    void observeOriginalInput(
        int /*original_input*/,
        const Tensor<CPUContext>& /*value*/,
        Tensor<CPUContext>* /*input_grad*/, // optional grad to populate
        int /*skip_dims*/) {}

    void appendGradShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };
};
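// Illustrative sketch, not part of the original header: the lifecycle a
// segment operator drives for an incremental reducer is observe -> construct
// -> process each element -> finish. Names such as `ctx`, `segment_out`,
// `start`, `end` and `kFixedSize` (the compile-time block-size hint picked by
// the dispatching operator) are hypothetical.
//
//   BaseReducer::Meta meta;
//   meta.observeInput(0, data, /*skip_dims=*/1);      // learn block shape/size
//   MeanReducer<float, CPUContext> reducer(meta, segment_out, &ctx);
//   for (TIndex row = start; row < end; ++row) {
//     const float* block = data.data<float>() + row * meta.block_size;
//     reducer.process<kFixedSize>(meta, block, row, &ctx);
//   }
//   reducer.finish<kFixedSize>(meta, &ctx);           // e.g. divide by count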
// Put forward and backward in the same template?
template <typename T, class Context>
class SumReducer;
template <typename T, class Context>
class SumReducerGradient;

template <typename T>
class SumReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : current_size_(0), out_(out) {
    // add a wrapper in Context for it
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }
  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_++, context);
    }
  }

 private:
  int current_size_;
  T* out_;
};

template <typename T, class Context>
class SumReducerGradient : public BaseReducerGradient {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int length) {
    if (FixedSize == 1) { // static if
      *data_grad = *s_grad_;
    } else if (meta.first_dim) {
      context->template Copy<T, Context, Context>(
          meta.block_size, s_grad_, data_grad);
    } else {
      math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct SumReducerDef {
  template <typename T, class Context>
  using Reducer = SumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};
// Put forward and backward in the same template?
template <typename T, class Context>
class WeightedSumReducer;
template <typename T, class Context>
class WeightedSumReducerGradient;

template <typename T>
class WeightedSumReducer<T, CPUContext> : public BaseReducer {
 public:
  static constexpr int kInputCount = 2;

  using FixedDispatch = FixedValues<1>;

  struct Meta : BaseReducer::Meta {
    const T* scalars;

    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void
    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
      if (input == 1) {
        CAFFE_ENFORCE_EQ(
            skip_dims, value.ndim(), "SCALARS mustn't have extra dimensions");
        scalars = value.data<T>();
        return;
      }
      BaseReducer::Meta::observeInput(input, value, skip_dims);
    }
  };

  WeightedSumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out) {
    // do we have a wrapper for it?
    memset(out, 0, sizeof(T) * meta.block_size);
  }
  template <int FixedSize>
  void
  process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "WeightedSumReducer implemented only for "
        "front dimensions reduction");
    math::AxpyFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], in, out_, context);
  }

 private:
  T* out_;
};

template <typename T, class Context>
class WeightedSumReducerGradient : public BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 1> originalInputs() {
    return {{1}};
  }

  static int numAuxInputsWithGrads(const OperatorDef& def) {
    return GetFlagArgument(def, "grad_on_weights");
  }

  static bool requiresDataInput(const OperatorDef& def) {
    return numAuxInputsWithGrads(def) > 0;
  }

  using FixedDispatch = FixedValues<1>;

  struct Meta : public BaseReducerGradient::Meta {
    const T* scalars;
    T* scalars_grad;

    using BaseReducerGradient::Meta::Meta;

    void observeOriginalInput(
        int original_input,
        const Tensor<CPUContext>& value,
        Tensor<CPUContext>* input_grad, // optional grad to populate
        int /*skip_dims*/) {
      CAFFE_ENFORCE_EQ(1, original_input);
      scalars = value.data<T>();
      if (input_grad) {
        input_grad->ResizeLike(value);
        scalars_grad = input_grad->mutable_data<T>();
      }
    }
  };

  WeightedSumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
  }

  // Special version which is called with the main input too, used only if
  // additional input grad is requested
  template <int FixedSize>
  void fillGradWithMainInput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
    math::Dot(
        meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
  }

 private:
  const T* s_grad_;
};

struct WeightedSumReducerDef {
  template <typename T, class Context>
  using Reducer = WeightedSumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = WeightedSumReducerGradient<T, Context>;
  static constexpr const char* name = "WeightedSum";
  static constexpr const char* doc =
      "Input slices are first scaled by SCALARS and then summed element-wise. "
      "It doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& schema) {
    schema.Input(0, "DATA", "Input tensor for the summation");
    schema.Input(
        1,
        "SCALARS",
        "Scalar multipliers for the input slices. Must be a vector with the "
        "length matching the number of slices");
    schema.Arg(
        "grad_on_weights",
        "Produce also gradient for `weights`. For now it's only supported in "
        "`Lengths`-based operators");
  }
};
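// Illustration only, not part of the original header: with DATA slices
// x_0 ... x_{n-1} and a SCALARS vector s of length n, the reducer above
// accumulates
//   out = sum_i s[i] * x_i        (element-wise over each block),
// so the gradient flowing back to DATA slice i is s[i] * d(out), which is
// what fillGrad's ScaleFixedSize computes. When `grad_on_weights` is set,
// fillGradWithMainInput additionally produces
//   d s[i] = dot(d(out), x_i)
// via the math::Dot call, writing it to meta.scalars_grad[i].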
template <typename T, class Context>
class MeanReducer;
template <typename T, class Context>
class MeanReducerGradient;

template <typename T>
class MeanReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MeanReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_, context);
    }
    current_size_++;
  }

  template <int FixedSize>
  void finish(const Meta& meta, CPUContext* context) {
    if (meta.first_dim) {
      if (current_size_ > 0) {
        math::ScaleFixedSize<T, CPUContext, FixedSize>(
            meta.block_size, 1.0 / current_size_, out_, out_, context);
      }
    } else {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          current_size_, 1.0 / meta.block_size, out_, out_, context);
    }
  }

 private:
  T* out_;
  int current_size_;
};

template <typename T, class Context>
class MeanReducerGradient : public BaseReducerGradient {
 public:
  static constexpr bool computeLength() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MeanReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int length) {
    CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
    if (meta.first_dim) {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1.0 / length, s_grad_, data_grad, context);
    } else {
      math::Set<T, CPUContext>(
          length, s_grad_[offset] * 1.0f / length, data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct MeanReducerDef {
  template <typename T, class Context>
  using Reducer = MeanReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computes the element-wise mean of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};
template <typename T, class Context>
class MaxReducer;
template <typename T, class Context>
class MaxReducerGradient;

template <typename T>
class MaxReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MaxReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {}

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "MaxReducer implemented only for front dimensions reduction");
    if (current_size_ > 0) {
      EigenVectorMap<T> output_vec(out_, meta.block_size);
      output_vec =
          output_vec.cwiseMax(ConstEigenVectorMap<T>(in, meta.block_size));
    } else {
      memcpy(out_, in, sizeof(T) * meta.block_size);
    }
    ++current_size_;
  }

 private:
  T* out_;
  int current_size_;
};

template <typename T, class Context>
class MaxReducerGradient : public BaseReducerGradient {
 public:
  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return true;
  }

  static bool requiresForwardOutput() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MaxReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGradWithMainInputAndForwardOutput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      const T* forward_output,
      TIndex /*offset*/,
      Context* /*context*/,
      const int /*length*/) {
    for (TIndex i = 0; i < meta.block_size; ++i) {
      data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0;
    }
  }

 private:
  const T* s_grad_;
};

struct MaxReducerDef {
  template <typename T, class Context>
  using Reducer = MaxReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computes the element-wise max of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_