Caffe2 - C++ API
A deep learning, cross platform ML framework
segment_reduction_op.h
1 #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
2 #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
3 
4 #include "caffe2/core/context.h"
5 #include "caffe2/core/logging.h"
6 #include "caffe2/core/operator.h"
7 #include "caffe2/operators/reducer_functors.h"
8 
9 namespace caffe2 {
10 
11 template <typename TData>
12 class BaseInputAccessor {
13  public:
14  BaseInputAccessor() {}
15 
16  bool observeInput(const Tensor<CPUContext>& dataInput) {
17  data_ = dataInput.raw_data();
18  return dataInput.template IsType<TData>();
19  }
20 
21  inline const TData*
22  getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) {
23  return static_cast<const TData*>(data_) + in_block_size * idx;
24  }
25 
26  protected:
27  const void* data_ = nullptr;
28 };
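The ops in this header are parameterized on an InputAccessor with the same two-method contract as BaseInputAccessor: observeInput() validates and captures the data blob, and getBlockPtr() returns a pointer to the idx-th block of in_block_size elements. A minimal sketch of an alternative accessor over an already-typed tensor (TypedInputAccessor is a hypothetical name, not part of the library):

template <typename TData>
class TypedInputAccessor {
 public:
  bool observeInput(const Tensor<CPUContext>& dataInput) {
    // Reject unsupported types instead of enforcing, mirroring BaseInputAccessor.
    if (!dataInput.template IsType<TData>()) {
      return false;
    }
    data_ = dataInput.template data<TData>();
    return true;
  }

  inline const TData*
  getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) {
    return data_ + in_block_size * idx;
  }

 private:
  const TData* data_ = nullptr;
};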
29 
31 // Range reducer ops: leverage the fact that input segments are contiguous and
32 // allow reducer functors to do something special
33 // Note: there are no real use cases for it yet :)
34 // Also, doesn't support additional arguments for now
36 
43 template <
44  typename T,
45  typename SIndex,
46  class Context,
47  class RangeReducer,
48  class InputAccessor = BaseInputAccessor<T>>
49 class AbstractSortedSegmentRangeOp : public Operator<Context> {
50  public:
51  USE_OPERATOR_CONTEXT_FUNCTIONS;
52  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp);
53 
54  bool RunOnDevice() override {
55  auto& dataInput = Input(DATA);
56  auto& segment_ids = Input(SEGMENT_IDS);
57  auto* output = Output(0);
58 
59  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
60  auto N = segment_ids.dim(0);
61  CAFFE_ENFORCE_EQ(
62  N,
63  dataInput.dim(0),
64  "SEGMENT_IDS must have the same length as outer dimension of DATA");
65 
66  OPERATOR_NEEDS_FEATURE(
67  inputAccessor_.observeInput(dataInput),
68  "Unsupported input type: ",
69  dataInput.meta().name(),
70  ".");
71 
72  const SIndex* s_ids = segment_ids.template data<SIndex>();
73 
74  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
75  auto shape = dataInput.dims();
76  shape[0] = K;
77  output->Resize(shape);
78 
79  T* out = output->template mutable_data<T>();
80 
81  if (N == 0) {
82  return true;
83  }
84 
85  TIndex block_size = dataInput.size() / N;
86 
87  // Assume the segments are sorted and there are no gaps
88  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
89  for (TIndex i = 0; i < N;) {
90  TIndex start = i;
91  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
92  ;
93 
94  RangeReducer()(
95  block_size,
96  i - start,
97  inputAccessor_.getBlockPtr(block_size, start, i - start),
98  out + block_size * s_ids[start],
99  &context_);
100 
101  // check correctness of the next segment
102  if (i < N) {
103  CAFFE_ENFORCE_EQ(
104  s_ids[start] + 1,
105  s_ids[i],
106  "Indices must be sorted and not have gaps");
107  }
108  }
109  return true;
110  }
111 
112  static constexpr int kNumInputs = 2;
113  INPUT_TAGS(DATA, SEGMENT_IDS);
114 
115  private:
116  InputAccessor inputAccessor_;
117 };
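The RangeReducer template parameter is a plain functor invoked once per contiguous segment with the call shape used above: (block_size, number of blocks in the segment, segment input, segment output, context). The real range reducers live in reducer_functors.h; a minimal sum sketch under that contract (SumRangeReducerSketch is a hypothetical name) looks like:

template <typename T>
struct SumRangeReducerSketch {
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in, // `blocks` rows of `block_size` elements for one segment
      T* out, // one output row of `block_size` elements
      CPUContext* /* context */) {
    for (TIndex j = 0; j < block_size; ++j) {
      T acc = 0;
      for (TIndex b = 0; b < blocks; ++b) {
        acc += in[b * block_size + j];
      }
      out[j] = acc;
    }
  }
};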
118 
119 template <
120  typename T,
121  typename SIndex,
122  class Context,
123  class RangeReducerGradient>
124 class AbstractSortedSegmentRangeGradientOp : public Operator<Context> {
125  public:
126  USE_OPERATOR_CONTEXT_FUNCTIONS;
127  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp);
128 
129  bool RunOnDevice() override {
130  // TODO(azzolini): avoid using input/output if not used by a particular op
131  auto& data_in = Input(DATA_IN);
132  auto& data_out = Input(DATA_OUT);
133  auto& segment_grads = Input(SEGMENT_GRADS);
134  auto& segment_ids = Input(SEGMENT_IDS);
135  auto* data_grads = Output(0);
136 
137  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
138  TIndex N = segment_ids.dim(0);
139 
140  const SIndex* s_ids = segment_ids.template data<SIndex>();
141  const T* s_grads = segment_grads.template data<T>();
142  const T* d_in = data_in.template data<T>();
143  const T* d_out = data_out.template data<T>();
144 
145  auto shape = segment_grads.dims();
146  shape[0] = N;
147  data_grads->Resize(shape);
148 
149  const SIndex K = segment_grads.dim(0);
150  T* out = data_grads->template mutable_data<T>();
151 
152  if (N == 0) {
153  return true;
154  }
155 
156  TIndex block_size = segment_grads.size_from_dim(1);
157 
158  // Assume the segments are sorted and there are no gaps
159  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
160  // repeat the check from forward op
161  CAFFE_ENFORCE_EQ(
162  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
163  for (TIndex i = 0; i < N;) {
164  TIndex start = i;
165  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
166  ;
167 
168  auto expanded_idx = block_size * start;
169  auto reduced_idx = block_size * s_ids[start];
170  RangeReducerGradient()(
171  block_size,
172  i - start,
173  s_grads + reduced_idx,
174  out + expanded_idx,
175  d_in + expanded_idx,
176  d_out + reduced_idx,
177  &context_);
178 
179  // check correctness of the next segment
180  if (i < N) {
181  CAFFE_ENFORCE_EQ(
182  s_ids[start] + 1,
183  s_ids[i],
184  "Indices must be sorted and not have gaps");
185  }
186  }
187  return true;
188  }
189 
190  static constexpr int kNumInputs = 4;
191  INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS);
192 };
193 
194 template <typename T, typename SIndex, typename Context, typename ReducerDef>
195 struct AbstractSortedSegmentRangeDef {
196  using OpDef = ReducerDef;
197  static constexpr const char* basename = "SortedSegmentRange";
198  static constexpr const char* doc = R"DOC(
199 Applies '{op}' to each segment of input tensor. In order to allow for more
200 efficient implementation of '{op}', the input segments have to be contiguous
201 and non-empty.
202 
203 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
204 DATA to a particular group (segment). Values belonging to the same segment are
205 aggregated together.
206 
207 The first dimension of the output is equal to the number of input segments,
208 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
209 
210 {op_doc}
211  )DOC";
212  static void PopulateSchema(OpSchema& schema) {
213  schema.Input(0, "DATA", "Input tensor to be aggregated");
214  schema.Input(
215  1,
216  "SEGMENT_IDS",
217  "Vector with the same length as the first dimension of DATA "
218  "and values in the range 0..K-1 and in increasing order that "
219  "maps each slice of DATA to one of the segments");
220  schema.Output(
221  0,
222  "OUTPUT",
223  "Aggregated tensor with the first dimension of K and the "
224  "other dimensions inherited from DATA");
225  }
226  using ForwardOp = AbstractSortedSegmentRangeOp<
227  T,
228  SIndex,
229  Context,
230  typename ReducerDef::template Reducer<T, Context>>;
231  using BackwardOp = AbstractSortedSegmentRangeGradientOp<
232  T,
233  SIndex,
234  Context,
235  typename ReducerDef::template ReducerGradient<T, Context>>;
236  struct GetGradient : public GradientMakerBase {
237  using GradientMakerBase::GradientMakerBase;
238  vector<OperatorDef> GetGradientDefs() override {
239  return SingleGradientDef(
240  string(basename) + ReducerDef::name + "Gradient",
241  "",
242  vector<string>{I(0), O(0), GO(0), I(1)},
243  // no gradient on segment_ids!
244  vector<string>{GI(0)});
245  }
246  };
247 };
248 
250 // Incremental reducer ops: assume that reducer consumes pieces of data one by
251 // one. Also, supports additional arguments passed to the reducer, e.g.
252 // scalars for weighted sum.
253 //
254 // Note: in current implementation additional inputs are considered auxiliary
255 // constants and have limitations:
256 // - there is no gradient computation for auxiliary inputs
257 // - auxiliary inputs aren't affected by fused embedding lookup in operations
258 // like sparse_sorted_segment
260 
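The incremental ops below drive their Reducer template parameter through an implicit protocol: a nested Meta observes the inputs and appends the output shape, one reducer instance is constructed per output block, fed one input block at a time via process<FixedSize>(), and closed with finish<FixedSize>(); kInputCount and FixedDispatch describe the number of inputs and the fixed-size dispatch options. A compact forward-only sketch of that contract (StreamingSumReducerSketch is a hypothetical name; the real reducers are in reducer_functors.h):

template <typename T>
class StreamingSumReducerSketch {
 public:
  static constexpr int kInputCount = 1; // only DATA, no auxiliary inputs
  using FixedDispatch = FixedValues<1>; // no specialized fixed block sizes

  struct Meta {
    vector<TIndex> block_shape;
    TIndex block_size = 1;

    explicit Meta(bool /* first_dim */ = true) {}

    void observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
      CAFFE_ENFORCE_EQ(0, input, "the sketch has no auxiliary inputs");
      block_shape.assign(value.dims().begin() + skip_dims, value.dims().end());
      block_size = value.size_from_dim(skip_dims);
    }

    void appendOutputShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };

  StreamingSumReducerSketch(const Meta& meta, T* out, CPUContext* /* context */)
      : out_(out), block_size_(meta.block_size) {
    // Start from zero; each process() call accumulates one block.
    for (TIndex j = 0; j < block_size_; ++j) {
      out_[j] = T(0);
    }
  }

  template <int FixedSize>
  void process(const Meta& /* meta */, const T* in, TIndex /* offset */, CPUContext*) {
    for (TIndex j = 0; j < block_size_; ++j) {
      out_[j] += in[j];
    }
  }

  template <int FixedSize>
  void finish(const Meta& /* meta */, CPUContext* /* context */) {}

 private:
  T* out_;
  TIndex block_size_;
};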
277 template <
278  typename T,
279  class Context,
280  class Reducer,
281  bool FirstDim,
282  class InputAccessor = BaseInputAccessor<T>>
283 class AbstractReduceFrontOrBackOp : public Operator<Context> {
284  public:
285  USE_OPERATOR_CONTEXT_FUNCTIONS;
286 
287  AbstractReduceFrontOrBackOp(const OperatorDef& operator_def, Workspace* ws)
288  : Operator<Context>(operator_def, ws),
289  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
290 
291  bool RunOnDevice() override {
292  auto& data = Input(0);
293  // If more complicated fixed size logic becomes necessary, it can be moved
294  // to the reducer class
295  TIndex in_block_size = FirstDim
296  ? data.size_from_dim(num_reduce_dims_)
297  : data.size_to_dim(data.ndim() - num_reduce_dims_);
298  return DispatchHelper<typename Reducer::FixedDispatch>::call(
299  this, in_block_size);
300  }
301 
302  template <int FixedSize>
303  bool DoRunWithValue() {
304  auto& data = Input(0);
305  auto* output = Output(0);
306 
307  CAFFE_ENFORCE_LE(num_reduce_dims_, data.ndim());
308 
309  typename Reducer::Meta ctx(FirstDim);
310  ctx.observeInput(0, data, num_reduce_dims_);
311  for (int i = 1; i < Reducer::kInputCount; ++i) {
312  auto& aux_in = Input(i);
313  ctx.observeInput(i, aux_in, num_reduce_dims_);
314  }
315 
316  OPERATOR_NEEDS_FEATURE(
317  inputAccessor_.observeInput(data),
318  "Unsupported input type: ",
319  data.meta().name(),
320  ".");
321 
322  vector<TIndex> shape;
323  ctx.appendOutputShape(&shape);
324  output->Resize(shape);
325 
326  T* out = output->template mutable_data<T>();
327 
328  const int block_size = FirstDim
329  ? data.size_from_dim(num_reduce_dims_)
330  : data.size_from_dim(data.ndim() - num_reduce_dims_);
331 
332  const int num_blocks = block_size > 0 ? data.size() / block_size : 0;
333 
334  Reducer r(ctx, out, &context_);
335  for (TIndex i = 0; i < num_blocks; ++i) {
336  r.template process<FixedSize>(
337  ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
338  }
339  r.template finish<FixedSize>(ctx, &context_);
340  return true;
341  }
342 
343  static constexpr int kNumInputs = Reducer::kInputCount;
344 
345  private:
346  int num_reduce_dims_;
347  InputAccessor inputAccessor_;
348 };
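For concreteness, the block arithmetic above works out as follows (illustrative numbers, assuming DATA of shape [2, 3, 4] and num_reduce_dim = 2):

  ReduceFront (FirstDim == true):  block_size = size_from_dim(2) = 4,
                                   num_blocks = 24 / 4 = 6, output shape = [4]
  ReduceBack  (FirstDim == false): block_size = size_from_dim(3 - 2) = 12,
                                   num_blocks = 24 / 12 = 2, output shape = [2]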
349 
350 template <
351  typename T,
352  class Context,
353  class ReducerGradient,
354  bool FirstDim = true>
355 class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
356  public:
357  USE_OPERATOR_CONTEXT_FUNCTIONS;
358 
359  AbstractReduceFrontOrBackGradientOp(
360  const OperatorDef& operator_def,
361  Workspace* ws)
362  : Operator<Context>(operator_def, ws),
363  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
364 
365  bool RunOnDevice() override {
366  // If more complicated fixed size logic becomes necessary, it can be moved
367  // to the reducer class
368  TIndex grad_block_size = Input(REDUCTION_GRAD).size();
369  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
370  this, grad_block_size);
371  }
372 
373  template <int FixedSize>
374  bool DoRunWithValue() {
375  auto& reduction_grad = Input(REDUCTION_GRAD);
376  auto& source_shape = OperatorBase::Input<TensorCPU>(SOURCE_SHAPE);
377 
378  auto* data_grads = Output(0);
379 
380  typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim);
381  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
382  auto& aux_in = Input(i);
383  ctx.observeOriginalInput(
384  ReducerGradient::originalInputs()[i],
385  aux_in,
386  nullptr, /*no grad*/
387  num_reduce_dims_);
388  }
389 
390  const T* r_grad = reduction_grad.template data<T>();
391 
392  CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.size());
393 
394  vector<TIndex> shape(
395  source_shape.template data<TIndex>(),
396  source_shape.template data<TIndex>() + source_shape.size());
397 
398  data_grads->Resize(shape);
399 
400  TIndex block_size = FirstDim
401  ? data_grads->size_from_dim(num_reduce_dims_)
402  : data_grads->size_from_dim(data_grads->ndim() - num_reduce_dims_);
403  TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0;
404 
405  T* out = data_grads->template mutable_data<T>();
406 
407  ReducerGradient r(ctx, r_grad, &context_);
408  for (TIndex i = 0; i < block_num; ++i) {
409  r.template fillGrad<FixedSize>(
410  ctx,
411  out + block_size * i,
412  i,
413  &context_,
414  FirstDim ? block_num : block_size);
415  }
416  return true;
417  }
418 
419  static constexpr int kNumInputs =
420  ReducerGradient::originalInputs().size() + 2;
421  enum _InputTags {
422  REDUCTION_GRAD = ReducerGradient::originalInputs().size(),
423  SOURCE_SHAPE
424  };
425 
426  private:
427  int num_reduce_dims_;
428 };
429 
430 template <typename T, typename Context, typename ReducerDef>
431 struct AbstractReduceFrontDef {
432  using OpDef = ReducerDef;
433  static constexpr const char* basename = "ReduceFront";
434  static constexpr const char* doc = R"DOC(
435 Reduces the input tensor along the first dimension of the input tensor by
436 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
437 UnsortedSegment{op} but as if all input slices belong to a single segment.
438 
439 {op_doc}
440  )DOC";
441  static void PopulateSchema(OpSchema& schema) {
442  schema.Input(
443  0, "DATA", "Input tensor to be reduced on the first dimension");
444  schema.TensorInferenceFunction([](const OperatorDef& def,
445  const vector<TensorShape>& in) {
446  CAFFE_ENFORCE_EQ(1, in.size());
447  ArgumentHelper helper(def);
448  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
449  typename ReducerDef::template Reducer<T, Context>::Meta ctx(true);
450  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
451  return vector<TensorShape>{
452  CreateTensorShape(out_dims, in[0].data_type())};
453  });
454  ReducerDef::PopulateSchema(schema);
455  }
456  using ReducerGradient =
457  typename ReducerDef::template ReducerGradient<T, Context>;
458  using ForwardOp = AbstractReduceFrontOrBackOp<
459  T,
460  Context,
461  typename ReducerDef::template Reducer<T, Context>,
462  true>;
463  using BackwardOp =
464  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>;
465  struct GetGradient : public GradientMakerBase {
466  using GradientMakerBase::GradientMakerBase;
467  vector<OperatorDef> GetGradientDefs() override {
468  // Have utility function generating these names?
469  string tmp_dims = "_" + O(0) + "_dims";
470 
471  vector<string> grad_ins;
472  for (const int i : ReducerGradient::originalInputs()) {
473  grad_ins.push_back(I(i));
474  }
475  grad_ins.push_back(GO(0));
476  grad_ins.push_back(tmp_dims);
477 
478  vector<Argument> args;
479  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
480  args.push_back(GetArgument(def_, "num_reduce_dim"));
481  }
482  // FIXME: pass in num_reduce_dims?!
483  return vector<OperatorDef>{
484  CreateOperatorDef(
485  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
486  CreateOperatorDef(
487  string(basename) + ReducerDef::name + "Gradient",
488  "",
489  grad_ins,
490  // no gradient on auxiliary inputs for now
491  vector<string>{GI(0)}),
492  };
493  }
494  };
495 };
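As an illustration of what GetGradient above emits (assuming the Sum reducer, i.e. the registered ReduceFrontSum op, with no auxiliary reducer inputs; blob names are illustrative):

  Shape(DATA) -> _OUTPUT_dims
  ReduceFrontSumGradient(OUTPUT_grad, _OUTPUT_dims) -> DATA_grad

The Shape op saves the original input dims so the gradient op can resize its output without re-reading DATA.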
496 
497 template <typename T, typename Context, typename ReducerDef>
498 struct AbstractReduceBackDef {
499  using OpDef = ReducerDef;
500  static constexpr const char* basename = "ReduceBack";
501  static constexpr const char* doc = R"DOC(
502 Reduces the input tensor along the last dimension of the input tensor by
503 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
504 UnsortedSegment{op} but as if all input slices belong to a single segment.
505 
506 {op_doc}
507  )DOC";
508  static void PopulateSchema(OpSchema& schema) {
509  schema.Input(
510  0, "DATA", "Input tensor to be reduced on the first dimension");
511  schema.TensorInferenceFunction([](const OperatorDef& def,
512  const vector<TensorShape>& in) {
513  CAFFE_ENFORCE_EQ(1, in.size());
514  ArgumentHelper helper(def);
515  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
516  typename ReducerDef::template Reducer<T, Context>::Meta ctx(false);
517  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
518  return vector<TensorShape>{
519  CreateTensorShape(out_dims, in[0].data_type())};
520  });
521  ReducerDef::PopulateSchema(schema);
522  }
523  using ReducerGradient =
524  typename ReducerDef::template ReducerGradient<T, Context>;
525  using ForwardOp = AbstractReduceFrontOrBackOp<
526  T,
527  Context,
528  typename ReducerDef::template Reducer<T, Context>,
529  false>;
530  using BackwardOp =
531  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>;
532  struct GetGradient : public GradientMakerBase {
533  using GradientMakerBase::GradientMakerBase;
534  vector<OperatorDef> GetGradientDefs() override {
535  // Have utility function generating these names?
536  string tmp_dims = "_" + O(0) + "_dims";
537 
538  vector<string> grad_ins;
539  for (const int i : ReducerGradient::originalInputs()) {
540  grad_ins.push_back(I(i));
541  }
542  grad_ins.push_back(GO(0));
543  grad_ins.push_back(tmp_dims);
544 
545  vector<Argument> args;
546  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
547  args.push_back(GetArgument(def_, "num_reduce_dim"));
548  }
549  // FIXME: pass in num_reduce_dims?!
550  return vector<OperatorDef>{
551  CreateOperatorDef(
552  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
553  CreateOperatorDef(
554  string(basename) + ReducerDef::name + "Gradient",
555  "",
556  grad_ins,
557  // no gradient on auxiliary inputs for now
558  vector<string>{GI(0)}),
559  };
560  }
561  };
562 };
563 
586 template <
587  typename T,
588  typename SIndex,
589  class Context,
590  class Reducer,
591  bool SparseFused = true,
592  class InputAccessor = BaseInputAccessor<T>>
593 class AbstractSortedSegmentOp : public Operator<Context> {
594  public:
595  USE_OPERATOR_CONTEXT_FUNCTIONS;
596  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
597 
598  bool RunOnDevice() override {
599  if (SparseFused) {
600  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
601  this, Input(INDICES));
602  } else {
603  // type doesn't matter
604  return DoRunWithType<TIndex>();
605  }
606  }
607 
608  template <typename IndexType>
609  bool DoRunWithType() {
610  // If more complicated fixed size logic becomes necessary, it can be moved
611  // to the reducer class
612  TIndex in_block_size = Input(0).size_from_dim(1);
613  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
614  this, in_block_size);
615  }
616 
617  template <typename IndexType, int FixedSize>
618  bool DoRunWithValue() {
619  auto& dataInput = Input(0);
620  auto& segment_ids = Input(SEGMENT_IDS);
621  auto* output = Output(0);
622 
623  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
624  TIndex N = segment_ids.dim(0);
625  const TIndex M = dataInput.dim(0);
626 
627  const IndexType* idxs;
628  if (SparseFused) { // static if
629  auto& indices = Input(INDICES);
630  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
631  CAFFE_ENFORCE_EQ(
632  N,
633  indices.dim(0),
634  "SEGMENT_IDS must have the same length as INDICES");
635  idxs = indices.template data<IndexType>();
636  } else {
637  CAFFE_ENFORCE_EQ(
638  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
639  }
640 
641  // It would probably look nicer with varargs templates but it's too much
642  // metaprogramming
643  typename Reducer::Meta ctx;
644  ctx.observeInput(0, dataInput, 1);
645  for (int i = 1; i < Reducer::kInputCount; ++i) {
646  auto& aux_in = Input(i);
647  CAFFE_ENFORCE_EQ(
648  N,
649  aux_in.dim(0),
650  "Input ",
651  i,
652  " must have the same first dim as SEGMENT_IDS");
653  ctx.observeInput(i, aux_in, 1);
654  }
655 
656  OPERATOR_NEEDS_FEATURE(
657  inputAccessor_.observeInput(dataInput),
658  "Unsupported input type: ",
659  dataInput.meta().name(),
660  ".");
661 
662  const SIndex* s_ids = segment_ids.template data<SIndex>();
663 
664  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
665  vector<TIndex> shape;
666  shape.push_back(K);
667  ctx.appendOutputShape(&shape);
668  output->Resize(shape);
669 
670  T* out = output->template mutable_data<T>();
671  if (N == 0) {
672  return true;
673  }
674  TIndex in_block_size = dataInput.size_from_dim(1);
675  TIndex out_block_size = output->size_from_dim(1);
676 
677  // Assume the segments are sorted and there are no gaps
678  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
679  for (TIndex i = 0; i < N;) {
680  TIndex start = i;
681 
682  Reducer r(ctx, out + out_block_size * s_ids[start], &context_);
683  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
684  IndexType idx;
685  if (SparseFused) { // static if
686  CAFFE_ENFORCE(
687  0 <= idxs[i] && idxs[i] < M,
688  "Index out of bounds: ",
689  idxs[i],
690  ", range 0 to ",
691  M);
692  idx = idxs[i];
693  } else {
694  idx = i;
695  }
696  r.template process<FixedSize>(
697  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
698  }
699 
700  r.template finish<FixedSize>(ctx, &context_);
701  // check correctness of the next segment
702  if (i < N) {
703  CAFFE_ENFORCE_EQ(
704  s_ids[start] + 1,
705  s_ids[i],
706  "Indices must be sorted and not have gaps");
707  }
708  }
709  return true;
710  }
711 
712  enum {
713  INDICES = Reducer::kInputCount,
714  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
715  };
716  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
717  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
718 
719  private:
720  InputAccessor inputAccessor_;
721 };
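A small illustration of the semantics above, assuming the Sum reducer (as in the registered SortedSegmentSum / SparseSortedSegmentSum ops):

  DATA = [[1, 2], [3, 4], [5, 6]], SEGMENT_IDS = [0, 0, 1]
    -> OUTPUT = [[4, 6], [5, 6]]   (rows 0 and 1 form segment 0, row 2 forms segment 1)
  With SparseFused, INDICES = [2, 0, 1] first gathers rows [5, 6], [1, 2], [3, 4],
  and the same SEGMENT_IDS then give
    -> OUTPUT = [[6, 8], [3, 4]]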
722 
723 // Gradient actually doesn't depend on whether sparse lookup is fused or not
724 template <typename T, typename SIndex, class Context, class ReducerGradient>
725 class AbstractSortedSegmentGradientOp : public Operator<Context> {
726  public:
727  USE_OPERATOR_CONTEXT_FUNCTIONS;
728  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
729 
730  bool RunOnDevice() override {
731  // If more complicated fixed size logic becomes necessary, it can be moved
732  // to the reducer class
733  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
734  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
735  this, grad_block_size);
736  }
737 
738  template <int FixedSize>
739  bool DoRunWithValue() {
740  auto& segment_grads = Input(SEGMENT_GRADS);
741  auto& segment_ids = Input(SEGMENT_IDS);
742  auto* data_grads = Output(0);
743 
744  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
745  TIndex N = segment_ids.dim(0);
746 
747  typename ReducerGradient::Meta ctx(segment_grads, 1);
748  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
749  auto& aux_in = Input(i);
750  CAFFE_ENFORCE_EQ(
751  N,
752  aux_in.dim(0),
753  "Input ",
754  i,
755  " must have the same first dim as SEGMENT_IDS");
756  ctx.observeOriginalInput(
757  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
758  }
759 
760  const SIndex* s_ids = segment_ids.template data<SIndex>();
761  const T* s_grads = segment_grads.template data<T>();
762 
763  vector<TIndex> shape;
764  shape.push_back(N);
765  ctx.appendGradShape(&shape);
766  data_grads->Resize(shape);
767 
768  TIndex d_block_size = data_grads->size_from_dim(1);
769  const SIndex K = segment_grads.dim(0);
770  TIndex s_block_size = segment_grads.size_from_dim(1);
771  T* out = data_grads->template mutable_data<T>();
772 
773  if (N == 0) {
774  return true;
775  }
776 
777  // Assume the segments are sorted and there are no gaps
778  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
779  // repeat the check from forward op
780  CAFFE_ENFORCE_EQ(
781  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
782  for (TIndex i = 0; i < N;) {
783  TIndex start = i;
784  TIndex end = start;
785 
786  if (ReducerGradient::computeLength()) {
787  for (; end < N && s_ids[start] == s_ids[end]; ++end) {
788  }
789  }
790 
791  ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_);
792  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
793  r.template fillGrad<FixedSize>(
794  ctx, out + d_block_size * i, i, &context_, end - start);
795  }
796 
797  // check correctness of the next segment
798  if (i < N) {
799  CAFFE_ENFORCE_EQ(
800  s_ids[start] + 1,
801  s_ids[i],
802  "Indices must be sorted and not have gaps");
803  }
804  }
805  return true;
806  }
807 
808  // Input layout:
809  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
810  // orig_argXs represent original op's inputs and will be passed to the reducer
811  // directly
812  static constexpr int kNumInputs =
813  ReducerGradient::originalInputs().size() + 2;
814  enum _InputTags {
815  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
816  SEGMENT_IDS
817  };
818 };
819 
820 // base implementation of sorted/unsorted sparse/non-sparse gradient computation
821 template <
822  typename ForwardOp,
823  typename ReducerDef,
824  typename ReducerGradient,
825  bool Sorted,
826  bool SparseFused>
827 struct SegmentOpGetGradient : public GradientMakerBase {
828  using GradientMakerBase::GradientMakerBase;
829  vector<OperatorDef> GetGradientDefs() override {
830  CAFFE_ENFORCE(
831  !ReducerGradient::requiresDataInput(Def()),
832  "grads on aux inputs are not yet implemented for Segment operators.");
833  vector<string> grad_ins;
834  for (const int i : ReducerGradient::originalInputs()) {
835  grad_ins.push_back(I(i));
836  }
837  grad_ins.push_back(GO(0));
838  grad_ins.push_back(I(ForwardOp::SEGMENT_IDS));
839  vector<OperatorDef> r{CreateOperatorDef(
840  string(Sorted ? "SortedSegment" : "UnsortedSegment") +
841  ReducerDef::name + "Gradient",
842  "",
843  grad_ins,
844  // no gradient on segment_ids or auxiliary inputs for now
845  vector<string>{SparseFused ? GI_V(0) : GI(0)})};
846  if (SparseFused) {
847  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
848  }
849  return r;
850  }
851 };
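For example, for SortedSegmentSum (a reducer without auxiliary inputs) the maker above generates a single gradient op (blob names illustrative):

  SortedSegmentSumGradient(OUTPUT_grad, SEGMENT_IDS) -> DATA_grad

For the SparseSortedSegment variant the data gradient is emitted as a sparse slice keyed by INDICES (GI_V(0) together with SetSparse) instead of a dense GI(0).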
852 
853 template <typename T, typename SIndex, typename Context, typename ReducerDef>
854 struct AbstractSortedSegmentDef {
855  using OpDef = ReducerDef;
856  static constexpr const char* basename = "SortedSegment";
857  static constexpr const char* doc = R"DOC(
858 Applies '{op}' to each segment of input tensor. Segments need to be sorted and
859 contiguous. See also UnsortedSegment{op} that doesn't have this requirement.
860 
861 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
862 DATA to a particular group (segment). Values belonging to the same segment are
863 aggregated together.
864 
865 The first dimension of the output is equal to the number of input segments,
866 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
867 
868 {op_doc}
869  )DOC";
870  static void PopulateSchema(OpSchema& schema) {
871  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
872  schema.Input(
873  Reducer::kInputCount,
874  "SEGMENT_IDS",
875  "Vector with the same length as the first dimension of DATA "
876  "and values in the range 0..K-1 and in increasing order that "
877  "maps each slice of DATA to one of the segments");
878  schema.Output(
879  0,
880  "OUTPUT",
881  "Aggregated output tensor. Has the first dimension of K "
882  "(the number of segments).");
883  ReducerDef::PopulateSchema(schema);
884  }
885  using Reducer = typename ReducerDef::template Reducer<T, Context>;
886  using ReducerGradient =
887  typename ReducerDef::template ReducerGradient<T, Context>;
888  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>;
889  using BackwardOp =
890  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
891  using GetGradient = SegmentOpGetGradient<
892  ForwardOp,
893  ReducerDef,
894  ReducerGradient,
895  true /*Sorted*/,
896  false /*SparseFused*/>;
897 };
898 
899 template <typename T, typename SIndex, typename Context, typename ReducerDef>
900 struct AbstractSparseSortedSegmentDef {
901  using OpDef = ReducerDef;
902  static constexpr const char* basename = "SparseSortedSegment";
903  static constexpr const char* doc = R"DOC(
904 Pulls in slices of the input tensor, groups them into segments and applies
905 '{op}' to each segment. Segments need to be sorted and contiguous. See also
906 SparseUnsortedSegment{op} that doesn't have this requirement.
907 
908 This op is basically Gather and SortedSegment{op} fused together.
909 
910 INDICES should contain integers in range 0..N-1 where N is the first dimension
911 of DATA. INDICES represent which slices of DATA need to be pulled in.
912 
913 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
914 particular group (segment). Values belonging to the same segment are aggregated
915 together. SEGMENT_IDS should have the same dimension as INDICES.
916 
917 The first dimension of the output is equal to the number of input segments,
918 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
919 
920 {op_doc}
921  )DOC";
922  static void PopulateSchema(OpSchema& schema) {
923  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
924  schema.Input(
925  Reducer::kInputCount,
926  "INDICES",
927  "Integer vector containing indices of the first dimension of DATA for "
928  "the slices that are being aggregated");
929  schema.Input(
930  Reducer::kInputCount + 1,
931  "SEGMENT_IDS",
932  "Vector with the same length as INDICES and values in the range "
933  "0..K-1 and in increasing order that maps each slice of DATA referenced"
934  " by INDICES to one of the segments");
935  schema.Output(
936  0,
937  "OUTPUT",
938  "Aggregated output tensor. Has the first dimension of K "
939  "(the number of segments).");
940  ReducerDef::PopulateSchema(schema);
941  }
942  using Reducer = typename ReducerDef::template Reducer<T, Context>;
943  using ReducerGradient =
944  typename ReducerDef::template ReducerGradient<T, Context>;
945  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>;
946  // TODO(dzhulgakov): we're registering the same class twice here,
947  // consider avoiding op duplication here
948  using BackwardOp =
949  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
950  using GetGradient = SegmentOpGetGradient<
951  ForwardOp,
952  ReducerDef,
953  ReducerGradient,
954  true /*Sorted*/,
955  true /*SparseFused*/>;
956 };
957 
987 template <
988  typename T,
989  typename SIndex,
990  class Context,
991  class Reducer,
992  bool SparseFused = true,
993  class InputAccessor = BaseInputAccessor<T>>
994 class AbstractUnsortedSegmentOp : public Operator<Context> {
995  public:
996  USE_OPERATOR_CONTEXT_FUNCTIONS;
997 
998  AbstractUnsortedSegmentOp(const OperatorDef& operator_def, Workspace* ws)
999  : Operator<Context>(operator_def, ws),
1000  OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
1001 
1002  bool RunOnDevice() override {
1003  if (SparseFused) {
1004  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1005  this, Input(INDICES));
1006  } else {
1007  // type doesn't matter
1008  return DoRunWithType<TIndex>();
1009  }
1010  }
1011 
1012  template <typename IndexType>
1013  bool DoRunWithType() {
1014  // If more complicated fixed size logic becomes necessary, it can be moved
1015  // to the reducer class
1016  TIndex in_block_size = Input(0).size_from_dim(1);
1017  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1018  this, in_block_size);
1019  }
1020 
1021  template <typename IndexType, int FixedSize>
1022  bool DoRunWithValue() {
1023  auto& data = Input(0);
1024  auto& segment_ids = Input(SEGMENT_IDS);
1025  auto* output = Output(0);
1026 
1027  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1028  TIndex N = segment_ids.dim(0);
1029  const TIndex M = data.dim(0);
1030 
1031  const IndexType* idxs;
1032  if (SparseFused) { // static if
1033  auto& indices = Input(INDICES);
1034  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
1035  CAFFE_ENFORCE_EQ(
1036  N,
1037  indices.dim(0),
1038  "SEGMENT_IDS must have the same length as INDICES");
1039  idxs = indices.template data<IndexType>();
1040  } else {
1041  CAFFE_ENFORCE_EQ(
1042  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
1043  }
1044 
1045  // It would probably look nicer with varargs templates but it's too much
1046  // metaprogramming
1047  typename Reducer::Meta ctx;
1048  ctx.observeInput(0, data, 1);
1049  for (int i = 1; i < Reducer::kInputCount; ++i) {
1050  auto& aux_in = Input(i);
1051  CAFFE_ENFORCE_EQ(
1052  N,
1053  aux_in.dim(0),
1054  "Input ",
1055  i,
1056  " must have the same first dim as SEGMENT_IDS");
1057  ctx.observeInput(i, aux_in, 1);
1058  }
1059 
1060  const SIndex* s_ids = segment_ids.template data<SIndex>();
1061  OPERATOR_NEEDS_FEATURE(
1062  inputAccessor_.observeInput(data),
1063  "Unsupported input type: ",
1064  data.meta().name(),
1065  ".");
1066 
1067  // determine the number of segments
1068  SIndex K;
1069  if (num_segments_ != -1) {
1070  K = num_segments_;
1071  } else {
1072  K = 0;
1073  for (TIndex i = 0; i < N; ++i) {
1074  K = std::max(K, s_ids[i] + 1);
1075  }
1076  }
1077 
1078  vector<TIndex> shape;
1079  shape.push_back(K);
1080  ctx.appendOutputShape(&shape);
1081  output->Resize(shape);
1082 
1083  TIndex in_block_size = data.size_from_dim(1);
1084  TIndex out_block_size = output->size_from_dim(1);
1085  T* out = output->template mutable_data<T>();
1086 
1087  reducers_.clear();
1088  reducers_.reserve(K);
1089  for (TIndex i = 0; i < K; ++i) {
1090  reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
1091  }
1092 
1093  for (TIndex i = 0; i < N; ++i) {
1094  auto s_id = s_ids[i];
1095  CAFFE_ENFORCE(
1096  0 <= s_id && s_id < K,
1097  "Segment id out of range: ",
1098  s_id,
1099  ", range 0 to ",
1100  K);
1101  IndexType idx;
1102  if (SparseFused) { // static if
1103  CAFFE_ENFORCE(
1104  0 <= idxs[i] && idxs[i] < M,
1105  "Index out of bounds: ",
1106  idxs[i],
1107  ", range 0 to ",
1108  M);
1109  idx = idxs[i];
1110  } else {
1111  idx = i;
1112  }
1113  reducers_[s_id].template process<FixedSize>(
1114  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
1115  }
1116 
1117  for (TIndex i = 0; i < K; ++i) {
1118  reducers_[i].template finish<FixedSize>(ctx, &context_);
1119  }
1120  // call reducers destructors (if there is any)
1121  reducers_.clear();
1122  return true;
1123  }
1124 
1125  enum {
1126  INDICES = Reducer::kInputCount,
1127  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1128  };
1129  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1130  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1131 
1132  private:
1133  TIndex num_segments_;
1134  // member field to reuse memory
1135  vector<Reducer> reducers_;
1136  InputAccessor inputAccessor_;
1137 };
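An illustration of the unsorted variant, assuming the Sum reducer (the registered UnsortedSegmentSum op):

  DATA = [[1, 2], [3, 4], [5, 6]], SEGMENT_IDS = [1, 0, 1]
    -> K = max(SEGMENT_IDS) + 1 = 2, OUTPUT = [[3, 4], [6, 8]]
  Passing num_segments = 4 would instead produce a 4 x 2 output whose two extra
  rows hold whatever the reducer produces for an empty segment (zeros for sum).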
1138 
1139 // Gradient actually doesn't depend on whether sparse lookup is fused or not
1140 template <typename T, typename SIndex, class Context, class ReducerGradient>
1141 class AbstractUnsortedSegmentGradientOp : public Operator<Context> {
1142  public:
1143  USE_OPERATOR_CONTEXT_FUNCTIONS;
1144  USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
1145 
1146  bool RunOnDevice() override {
1147  // If more complicated fixed size logic becomes necessary, it can be moved
1148  // to the reducer class
1149  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1150  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1151  this, grad_block_size);
1152  }
1153 
1154  template <int FixedSize>
1155  bool DoRunWithValue() {
1156  auto& segment_grads = Input(SEGMENT_GRADS);
1157  auto& segment_ids = Input(SEGMENT_IDS);
1158  auto* data_grads = Output(0);
1159 
1160  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1161  TIndex N = segment_ids.dim(0);
1162 
1163  typename ReducerGradient::Meta ctx(segment_grads, 1);
1164  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1165  auto& aux_in = Input(i);
1166  CAFFE_ENFORCE_EQ(
1167  N,
1168  aux_in.dim(0),
1169  "Input ",
1170  i,
1171  " must have the same first dim as SEGMENT_IDS");
1172  ctx.observeOriginalInput(
1173  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1174  }
1175 
1176  const SIndex* s_ids = segment_ids.template data<SIndex>();
1177  const T* s_grads = segment_grads.template data<T>();
1178 
1179  vector<TIndex> shape;
1180  shape.push_back(N);
1181  ctx.appendGradShape(&shape);
1182  data_grads->Resize(shape);
1183 
1184  TIndex d_block_size = data_grads->size_from_dim(1);
1185  const SIndex K = segment_grads.dim(0);
1186  TIndex s_block_size = segment_grads.size_from_dim(1);
1187  T* out = data_grads->template mutable_data<T>();
1188 
1189  if (ReducerGradient::computeLength()) {
1190  segment_length_.resize(K, 0);
1191  for (int i = 0; i < N; ++i) {
1192  auto s_id = s_ids[i];
1193  CAFFE_ENFORCE(
1194  0 <= s_id && s_id < K,
1195  "Segment id out of range: ",
1196  s_id,
1197  ", range 0 to ",
1198  K);
1199  segment_length_[s_ids[i]]++;
1200  }
1201  }
1202 
1203  reducers_.clear();
1204  reducers_.reserve(K);
1205  for (SIndex i = 0; i < K; ++i) {
1206  reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
1207  }
1208 
1209  for (TIndex i = 0; i < N; ++i) {
1210  auto s_id = s_ids[i];
1211  if (ReducerGradient::computeLength()) {
1212  reducers_[s_id].template fillGrad<FixedSize>(
1213  ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]);
1214  } else {
1215  reducers_[s_id].template fillGrad<FixedSize>(
1216  ctx, out + d_block_size * i, i, &context_, 0);
1217  }
1218  }
1219  // call reducers destructors (if there is any)
1220  reducers_.clear();
1221  return true;
1222  }
1223 
1224  // Input layout:
1225  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
1226  // orig_argXs represent original op's inputs and will be passed to the reducer
1227  // directly
1228  static constexpr int kNumInputs =
1229  ReducerGradient::originalInputs().size() + 2;
1230  enum _InputTags {
1231  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1232  SEGMENT_IDS
1233  };
1234 
1235  private:
1236  // member field to reuse memory
1237  vector<ReducerGradient> reducers_;
1238  vector<int> segment_length_;
1239 };
1240 
1241 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1242 struct AbstractUnsortedSegmentDef {
1243  using OpDef = ReducerDef;
1244  static constexpr const char* basename = "UnsortedSegment";
1245  static constexpr const char* doc = R"DOC(
1246 Applies '{op}' to each segment of input tensor. Segments ids can appear in
1247 arbitrary order (unlike in SortedSegment{op}).
1248 
1249 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
1250 DATA to a particular group (segment). Values belonging to the same segment are
1251 aggregated together.
1252 
1253 If the `num_segments` argument is passed, it is used as the first dimension of
1254 the output. Otherwise it is computed dynamically as the max value of
1255 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1256 tensor.
1257 
1258 {op_doc}
1259  )DOC";
1260  static void PopulateSchema(OpSchema& schema) {
1261  schema.Arg(
1262  "num_segments",
1263  "Optional int argument specifying the number of output segments and "
1264  "thus the first dimension of the output");
1265  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1266  schema.Input(
1267  Reducer::kInputCount,
1268  "SEGMENT_IDS",
1269  "Integer vector with the same length as the first dimension of DATA "
1270  "that maps each slice of DATA to one of the segments");
1271  schema.Output(
1272  0,
1273  "OUTPUT",
1274  "Aggregated output tensor. Has the first dimension equal to the "
1275  "number of segments.");
1276  ReducerDef::PopulateSchema(schema);
1277  }
1278  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1279  using ReducerGradient =
1280  typename ReducerDef::template ReducerGradient<T, Context>;
1281  using ForwardOp = AbstractUnsortedSegmentOp<
1282  T,
1283  SIndex,
1284  Context,
1285  typename ReducerDef::template Reducer<T, Context>,
1286  false>;
1287  using BackwardOp =
1288  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1289  using GetGradient = SegmentOpGetGradient<
1290  ForwardOp,
1291  ReducerDef,
1292  ReducerGradient,
1293  false /*Sorted*/,
1294  false /*SparseFused*/>;
1295 };
1296 
1297 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1298 struct AbstractSparseUnsortedSegmentDef {
1299  using OpDef = ReducerDef;
1300  static constexpr const char* basename = "SparseUnsortedSegment";
1301  static constexpr const char* doc = R"DOC(
1302 Pulls in slices of the input tensor, groups them into segments and applies
1303 '{op}' to each segment. Segments ids can appear in arbitrary order (unlike in
1304 SparseSortedSegment{op}).
1305 
1306 This op is basically Gather and UnsortedSegment{op} fused together.
1307 
1308 INDICES should contain integers in range 0..N-1 where N is the first dimension
1309 of DATA. INDICES represent which slices of DATA need to be pulled in.
1310 
1311 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
1312 particular group (segment). Values belonging to the same segment are aggregated
1313 together. SEGMENT_IDS should have the same dimension as INDICES.
1314 
1315 If the `num_segments` argument is passed, it is used as the first dimension of
1316 the output. Otherwise it is computed dynamically as the max value of
1317 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1318 tensor.
1319 
1320 {op_doc}
1321  )DOC";
1322  static void PopulateSchema(OpSchema& schema) {
1323  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1324  schema.Input(
1325  Reducer::kInputCount,
1326  "INDICES",
1327  "Integer vector containing indices of the first dimension of DATA for "
1328  "the slices that are being aggregated");
1329  schema.Input(
1330  Reducer::kInputCount + 1,
1331  "SEGMENT_IDS",
1332  "Integer vector with the same length as INDICES that maps each slice "
1333  "of DATA referenced by INDICES to one of the segments");
1334  schema.Output(
1335  0,
1336  "OUTPUT",
1337  "Aggregated output tensor. Has the first dimension equal to the "
1338  "number of segments.");
1339  ReducerDef::PopulateSchema(schema);
1340  }
1341  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1342  using ReducerGradient =
1343  typename ReducerDef::template ReducerGradient<T, Context>;
1344  using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>;
1345  // TODO(dzhulgakov): we're registering the same class twice here,
1346  // consider avoiding op duplication here
1347  using BackwardOp =
1348  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1349  using GetGradient = SegmentOpGetGradient<
1350  ForwardOp,
1351  ReducerDef,
1352  ReducerGradient,
1353  false /*Sorted*/,
1354  true /*SparseFused*/>;
1355 };
1356 
1379 // TODO(dzhulgakov): for now it's implemented with incremental reducers because
1380 // of fused sparse support. But using "lengths" representation actually implies
1381 // continuous segments and thus range reducers can be used for non-sparse
1382 // version.
1383 
1384 template <
1385  typename TData,
1386  typename TLengths,
1387  class Context,
1388  class Reducer,
1389  bool SparseFused = true,
1390  class InputAccessor = BaseInputAccessor<TData>>
1391 class AbstractLengthsOp : public Operator<Context> {
1392  public:
1393  USE_OPERATOR_CONTEXT_FUNCTIONS;
1394  USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
1395 
1396  bool RunOnDevice() override {
1397  if (SparseFused) {
1398  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1399  this, Input(INDICES));
1400  } else {
1401  // type doesn't matter
1402  return DoRunWithType<TIndex>();
1403  }
1404  }
1405 
1406  template <typename IndexType>
1407  bool DoRunWithType() {
1408  // If more complicated fixed size logic becomes necessary, it can be moved
1409  // to the reducer class
1410  TIndex in_block_size = Input(0).size_from_dim(1);
1411  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1412  this, in_block_size);
1413  }
1414 
1415  template <typename IndexType, int FixedSize>
1416  bool DoRunWithValue() {
1417  auto& dataInput = Input(0);
1418  auto& lengthsInput = Input(LENGTHS);
1419  auto* output = Output(0);
1420 
1421  CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector");
1422  const TIndex dataSize = dataInput.dim(0);
1423  // Either the first dim of the data or how many slices we pull in via INDICES
1424  TIndex dataToReduceSize;
1425  const TIndex outputSize = lengthsInput.dim(0);
1426 
1427  const IndexType* indices;
1428  if (SparseFused) { // static if
1429  auto& indicesInput = Input(INDICES);
1430  CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector");
1431  indices = indicesInput.template data<IndexType>();
1432  dataToReduceSize = indicesInput.dim(0);
1433  } else {
1434  dataToReduceSize = dataSize;
1435  }
1436 
1437  typename Reducer::Meta ctx;
1438  ctx.observeInput(0, dataInput, 1);
1439  for (int i = 1; i < Reducer::kInputCount; ++i) {
1440  auto& aux_in = Input(i);
1441  CAFFE_ENFORCE(
1442  dataToReduceSize == aux_in.dim(0),
1443  "Input ",
1444  i,
1445  " must have the same first dim as SEGMENT_IDS");
1446  ctx.observeInput(i, aux_in, 1);
1447  }
1448 
1449  const TLengths* lengths = lengthsInput.template data<TLengths>();
1450 
1451  OPERATOR_NEEDS_FEATURE(
1452  inputAccessor_.observeInput(dataInput),
1453  "Unsupported input type: ",
1454  dataInput.meta().name(),
1455  ".");
1456 
1457  vector<TIndex> shape{outputSize};
1458  ctx.appendOutputShape(&shape);
1459  output->Resize(shape);
1460 
1461  TIndex in_block_size = dataInput.size_from_dim(1);
1462  TIndex out_block_size = output->size_from_dim(1);
1463  TData* out = output->template mutable_data<TData>();
1464 
1465  TIndex dataIndex = 0;
1466  for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
1467  Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
1468  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1469  ++dataIndex) {
1470  IndexType idx;
1471  if (SparseFused) { // static if
1472  idx = indices[dataIndex];
1473  CAFFE_ENFORCE(
1474  0 <= idx && idx < dataSize,
1475  "The ",
1476  dataIndex,
1477  "th index from the input indices is out of bounds: ",
1478  idx,
1479  " vs. valid range 0 to ",
1480  dataSize);
1481  } else {
1482  idx = dataIndex;
1483  CAFFE_ENFORCE(
1484  0 <= idx && idx < dataSize,
1485  "When calculating the ",
1486  rangeIndex,
1487  "th output with length=",
1488  lengths[rangeIndex],
1489  ", the index is out of bounds: ",
1490  idx,
1491  " vs. valid range 0 to ",
1492  dataSize);
1493  }
1494 
1495  const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx);
1496  reducer.template process<FixedSize>(ctx, input, dataIndex, &context_);
1497  }
1498  reducer.template finish<FixedSize>(ctx, &context_);
1499  }
1500  CAFFE_ENFORCE(
1501  dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
1502 
1503  return true;
1504  }
1505 
1506  enum {
1507  INDICES = Reducer::kInputCount,
1508  LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1509  };
1510  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1511  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1512 
1513  private:
1514  InputAccessor inputAccessor_;
1515 };
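An illustration of the lengths-based variant, assuming the Sum reducer (the registered LengthsSum / SparseLengthsSum ops):

  DATA = [[1, 2], [3, 4], [5, 6]], LENGTHS = [2, 1]
    -> OUTPUT = [[4, 6], [5, 6]]   (rows 0..1 form the first segment, row 2 the second)
  With SparseFused, INDICES = [2, 2, 0] and LENGTHS = [2, 1] instead sum rows
  [5, 6] and [5, 6] into the first output row and copy row 0 into the second:
    -> OUTPUT = [[10, 12], [1, 2]]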
1516 
1517 /*
1518  * Some notice:
1519  * 1. Gradient actually doesn't depend on whether sparse lookup is fused or not
1520  * 2. INDICES are not used in CPU version, but they are needed in async CUDA
1521  * version. So we register the 3-input version for CPU as the gradient op, for
1522  * GPU/CPU conversion. We then register the 2-input version for CPU for
1523  * backward compatibility with older nets.
1524  */
1525 template <
1526  typename T,
1527  typename TLengths,
1528  class Context,
1529  class ReducerGradient,
1530  bool GradientNeedIndices = false>
1531 class AbstractLengthsGradientOp : public Operator<Context> {
1532  public:
1533  USE_OPERATOR_CONTEXT_FUNCTIONS;
1534  USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
1535 
1536  bool RunOnDevice() override {
1537  // If more complicated fixed size logic becomes necessary, it can be moved
1538  // to the reducer class
1539  TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
1540  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1541  this, gradBlockSize);
1542  }
1543 
1544  template <int FixedSize>
1545  bool DoRunWithValue() {
1546  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1547  auto& lengthsInput = Input(LENGTHS);
1548  auto* dataGradsOutput = Output(0);
1549 
1550  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1551  TIndex reducedDataSize = 0;
1552  TIndex numSegments = lengthsInput.dim(0);
1553  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1554  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1555  const TLengths* lengths = lengthsInput.template data<TLengths>();
1556  for (TIndex i = 0; i < numSegments; ++i) {
1557  reducedDataSize += lengths[i];
1558  }
1559 
1560  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1561  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1562  auto& aux_in = Input(i);
1563  CAFFE_ENFORCE_EQ(
1564  reducedDataSize,
1565  aux_in.dim(0),
1566  "Input ",
1567  i,
1568  " must have the same first dim as SEGMENT_IDS");
1569  ctx.observeOriginalInput(
1570  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1571  }
1572 
1573  const T* segmentGrads = segmentGradsInput.template data<T>();
1574 
1575  vector<TIndex> shape;
1576  shape.push_back(reducedDataSize);
1577  ctx.appendGradShape(&shape);
1578  dataGradsOutput->Resize(shape);
1579 
1580  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1581  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1582  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1583 
1584  TIndex dataIndex = 0;
1585  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1586  ReducerGradient reducer(
1587  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1588  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1589  ++dataIndex) {
1590  reducer.template fillGrad<FixedSize>(
1591  ctx,
1592  dataGrads + dataGradsBlockSize * dataIndex,
1593  dataIndex,
1594  &context_,
1595  lengths[rangeIndex]);
1596  }
1597  }
1598  CAFFE_ENFORCE(
1599  dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
1600  return true;
1601  }
1602 
1603  // Input layout:
1604  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, INDICES
1605  // orig_argXs represent original op's inputs and will be passed to the reducer
1606  // directly
1607  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1608  2 + (GradientNeedIndices ? 1 : 0);
1609  enum _InputTags {
1610  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1611  LENGTHS,
1612  INDICES
1613  };
1614 };
1615 
1616 // Version of gradient that requires the main input and thus needs to receive
1617 // length, indices and other stuff
1618 template <
1619  typename T,
1620  typename TLengths,
1621  class Context,
1622  class ReducerGradient,
1623  bool SparseFused = true,
1624  bool GradientNeedIndices = false>
1625 class AbstractLengthsWithMainInputGradientOp : public Operator<Context> {
1626  public:
1627  USE_OPERATOR_CONTEXT_FUNCTIONS;
1628  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp);
1629 
1630  bool RunOnDevice() override {
1631  if (SparseFused) {
1632  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1633  this, Input(INDICES));
1634  } else {
1635  // type doesn't matter
1636  return DoRunWithType<TIndex>();
1637  }
1638  }
1639 
1640  template <typename IndexType>
1641  bool DoRunWithType() {
1642  // If more complicated fixed size logic becomes necessary, it can be moved
1643  // to the reducer class
1644  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1645  return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>::
1646  call(this, in_block_size);
1647  }
1648 
1649  template <typename IndexType, int FixedSize>
1650  bool DoRunWithValue() {
1651  auto& dataInput = Input(DATA_INPUT);
1652  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1653  auto& lengthsInput = Input(LENGTHS);
1654  auto* dataGradsOutput = Output(0);
1655 
1656  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1657  TIndex numSegments = lengthsInput.dim(0);
1658  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1659  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1660  const TLengths* lengths = lengthsInput.template data<TLengths>();
1661 
1662  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1663  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1664  int aux_num = ReducerGradient::originalInputs()[i];
1665  auto& aux_in = Input(i);
1666  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1667  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1668  }
1669 
1670  // Either the first dim of the data or how many slices we pull in via INDICES
1671  TIndex dataToReduceSize;
1672  const IndexType* indices = nullptr;
1673  if (SparseFused) { // static if
1674  auto& indicesInput = Input(INDICES);
1675  indices = indicesInput.template data<IndexType>();
1676  dataToReduceSize = indicesInput.dim(0);
1677  } else {
1678  dataToReduceSize = dataInput.dim(0);
1679  }
1680 
1681  const T* segmentGrads = segmentGradsInput.template data<T>();
1682 
1683  vector<TIndex> shape;
1684  shape.push_back(dataToReduceSize);
1685  ctx.appendGradShape(&shape);
1686  dataGradsOutput->Resize(shape);
1687 
1688  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1689  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1690  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1691 
1692  const T* data = dataInput.template data<T>();
1693 
1694  TIndex dataIndex = 0;
1695  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1696  ReducerGradient reducer(
1697  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1698  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1699  ++dataIndex) {
1700  IndexType data_pos;
1701  // No range checking, should've been verified in forward pass
1702  if (SparseFused) { // static if
1703  data_pos = indices[dataIndex];
1704  } else {
1705  data_pos = dataIndex;
1706  }
1707  reducer.template fillGradWithMainInput<FixedSize>(
1708  ctx,
1709  data + dataGradsBlockSize * data_pos,
1710  dataGrads + dataGradsBlockSize * dataIndex,
1711  dataIndex,
1712  &context_,
1713  lengths[rangeIndex]);
1714  }
1715  }
1716  return true;
1717  }
1718 
1719  // Input layout:
1720  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS,
1721  // DATA_INPUT, [INDICES]
1722  // orig_argXs represent original op's inputs and will be passed to the reducer
1723  // directly
1724  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1725  3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0);
1726  enum _InputTags {
1727  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1728  LENGTHS,
1729  DATA_INPUT,
1730  INDICES,
1731  };
1732 };
1733 
1734 // Version of gradient that requires the main input as well as the output of the
1735 // forward op.
1736 template <typename T, typename TLengths, class Context, class ReducerGradient>
1737 class AbstractLengthsWithMainInputAndForwardOutputGradientOp
1738  : public Operator<Context> {
1739  public:
1740  USE_OPERATOR_CONTEXT_FUNCTIONS;
1741  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp);
1742 
1743  bool RunOnDevice() override {
1744  // If more complicated fixed size logic becomes necessary, it can be moved
1745  // to the reducer class.
1746  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1747  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1748  this, in_block_size);
1749  }
1750 
1751  template <int FixedSize>
1752  bool DoRunWithValue() {
1753  auto& dataInput = Input(DATA_INPUT);
1754  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1755  auto& lengthsInput = Input(LENGTHS);
1756  auto& forwardOutputInput = Input(FORWARD_OUTPUT);
1757  auto* dataGradsOutput = Output(0);
1758 
1759  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1760  TIndex numSegments = lengthsInput.dim(0);
1761  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1762  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1763  const TLengths* lengths = lengthsInput.template data<TLengths>();
1764 
1765  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1766  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1767  int aux_num = ReducerGradient::originalInputs()[i];
1768  auto& aux_in = Input(i);
1769  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1770  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1771  }
1772 
1773  CAFFE_ENFORCE(forwardOutputInput.ndim() > 0);
1774  CAFFE_ENFORCE(numSegments == forwardOutputInput.dim(0));
1775  const T* forwardOutput = forwardOutputInput.template data<T>();
1776 
1777  TIndex dataToReduceSize = dataInput.dim(0);
1778 
1779  const T* segmentGrads = segmentGradsInput.template data<T>();
1780 
1781  vector<TIndex> shape;
1782  shape.push_back(dataToReduceSize);
1783  ctx.appendGradShape(&shape);
1784  dataGradsOutput->Resize(shape);
1785 
1786  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1787  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1788  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1789 
1790  const T* data = dataInput.template data<T>();
1791 
1792  TIndex dataIndex = 0;
1793  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1794  ReducerGradient reducer(
1795  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1796  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1797  ++dataIndex) {
1798  // No range checking, should've been verified in forward pass
1799  reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>(
1800  ctx,
1801  data + dataGradsBlockSize * dataIndex,
1802  dataGrads + dataGradsBlockSize * dataIndex,
1803  forwardOutput + segmentBlockSize * rangeIndex,
1804  dataIndex,
1805  &context_,
1806  lengths[rangeIndex]);
1807  }
1808  }
1809  return true;
1810  }
1811 
1812  // Input layout:
1813  // orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, SEGMENT_GRADS,
1814  // LENGTHS, DATA_INPUT
1815  // orig_argXs represent original op's inputs and will be passed to the reducer
1816  // directly
1817  static constexpr int kNumInputs =
1818  ReducerGradient::originalInputs().size() + 4;
1819  enum _InputTags {
1820  FORWARD_OUTPUT = ReducerGradient::originalInputs().size(),
1821  SEGMENT_GRADS,
1822  LENGTHS,
1823  DATA_INPUT,
1824  };
1825 };
1826 
1827 // base implementation of sparse/non-sparse gradient computation
1828 template <
1829  typename ForwardOp,
1830  typename ReducerDef,
1831  typename ReducerGradient,
1832  bool SparseFused,
1833  bool GradientNeedIndices = false>
1834 struct LengthsOpGetGradient : public GradientMakerBase {
1835  using GradientMakerBase::GradientMakerBase;
1836  vector<OperatorDef> GetGradientDefs() override {
1837  vector<string> grad_ins;
1838  string suffix = "Gradient";
1839  for (const int i : ReducerGradient::originalInputs()) {
1840  grad_ins.push_back(I(i));
1841  }
1842  if (ReducerGradient::requiresForwardOutput()) {
1843  grad_ins.push_back(O(0));
1844  CAFFE_ENFORCE(
1845  !SparseFused,
1846  "Forward pass output not yet supported as input for backward pass "
1847  "for SparseLengthsXXX operators");
1848  suffix = "AndForwardOutput" + suffix;
1849  }
1850  grad_ins.push_back(GO(0));
1851  grad_ins.push_back(I(ForwardOp::LENGTHS));
1852  bool indices_pushed = false;
1853  if (ReducerGradient::requiresDataInput(Def())) {
1854  grad_ins.push_back(I(0));
1855  if (SparseFused) {
1856  grad_ins.push_back(I(ForwardOp::INDICES));
1857  indices_pushed = true;
1858  }
1859  suffix = "WithMainInput" + suffix;
1860  }
1861  if (GradientNeedIndices && !indices_pushed) {
1862  if (SparseFused) {
1863  grad_ins.push_back(I(ForwardOp::INDICES));
1864  } else {
1865  // Hacky: pass the main input in place of INDICES; remove this once a
1866  // specialized CUDA LengthsIndicesInGradientSumGradient exists
1867  grad_ins.push_back(I(0));
1868  }
1869  }
1870  vector<string> grad_outs;
1871  grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)});
1872  int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def());
1873  for (int i = 1; i <= aux_grads; ++i) {
1874  grad_outs.push_back(GI(i));
1875  }
1876  vector<OperatorDef> r{CreateOperatorDef(
1877  string(SparseFused ? "SparseLengths" : "Lengths") +
1878  string(GradientNeedIndices ? "IndicesInGradient" : "") +
1879  ReducerDef::name + suffix,
1880  "",
1881  grad_ins,
1882  grad_outs)};
1883  if (SparseFused) {
1884  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
1885  }
1886  return r;
1887  }
1888 };
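 // Illustrative example: with ReducerDef::name == "Sum", SparseFused == true,
 // and a reducer gradient that needs neither the forward output nor the main
 // input, the generated gradient op is "SparseLengthsSumGradient"; setting
 // GradientNeedIndices yields "SparseLengthsIndicesInGradientSumGradient".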
1889 
1890 template <
1891  typename T,
1892  typename SIndex,
1893  typename Context,
1894  typename ReducerDef,
1895  bool GradientNeedIndices = false>
1896 struct AbstractLengthsDef {
1897  using OpDef = ReducerDef;
1898  static constexpr const char* basename = "Lengths";
1899  static constexpr const char* doc = R"DOC(
1900 Applies '{op}' to each segment of the input tensor. Segments are defined
1901 by their LENGTHS.
1902 
1903 LENGTHS is a vector that maps each of the first dimension slices of the
1904 DATA to a particular group (segment). Values belonging to the same segment are
1905 aggregated together.
1906 
1907 For example, LENGTHS = [2, 1] stands for segments DATA[0..1] and DATA[2].
1908 
1909 The first dimension of the output is equal to the number of input segments,
1910 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1911 
1912 {op_doc}
1913  )DOC";
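 // Illustrative example: with DATA of shape [3, 2], LENGTHS = [2, 1], and the
 // Sum reducer (i.e. the LengthsSum op), OUTPUT has shape [2, 2]:
 //   OUTPUT[0] = DATA[0] + DATA[1]  (first segment, length 2)
 //   OUTPUT[1] = DATA[2]            (second segment, length 1)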
1914  static void PopulateSchema(OpSchema& schema) {
1915  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1916  schema.Input(
1917  Reducer::kInputCount,
1918  "LENGTHS",
1919  "Vector of segment lengths whose elements sum to the first dimension of DATA");
1920  schema.Output(
1921  0,
1922  "OUTPUT",
1923  "Aggregated output tensor. Its first dimension is len(LENGTHS).");
1924  schema.TensorInferenceFunction(
1925  [](const OperatorDef& def, const vector<TensorShape>& in) {
1926  vector<TensorShape> out(0);
1927  TensorShape output;
1928  for (int d : in[Reducer::kInputCount].dims()) {
1929  output.add_dims(d);
1930  }
1931  for (int j = 1; j < in[0].dims_size(); j++) {
1932  output.add_dims(in[0].dims(j));
1933  }
1934  output.set_data_type(in[0].data_type());
1935  out.push_back(output);
1936  return out;
1937  });
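 // The inference function above therefore yields an OUTPUT shape of
 // [len(LENGTHS)] followed by DATA.dims()[1:].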
1938  ReducerDef::PopulateSchema(schema);
1939  }
1940  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1941  using ReducerGradient =
1942  typename ReducerDef::template ReducerGradient<T, Context>;
1943  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
1944  using BackwardOp =
1945  AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
1946  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
1947  T,
1948  SIndex,
1949  Context,
1950  ReducerGradient,
1951  false>;
1952  using WithMainInputAndForwardOutputBackwardOp =
1953  AbstractLengthsWithMainInputAndForwardOutputGradientOp<
1954  T,
1955  SIndex,
1956  Context,
1957  ReducerGradient>;
1958  using GetGradient = LengthsOpGetGradient<
1959  ForwardOp,
1960  ReducerDef,
1961  ReducerGradient,
1962  false /*SparseFused*/,
1963  GradientNeedIndices>;
1964 };
1965 
1966 template <
1967  typename T,
1968  typename SIndex,
1969  typename Context,
1970  typename ReducerDef,
1971  bool GradientNeedIndices = false>
1972 struct AbstractSparseLengthsDef {
1973  using OpDef = ReducerDef;
1974  static constexpr const char* basename = "SparseLengths";
1975  static constexpr const char* doc = R"DOC(
1976 Pulls in slices of the input tensor, groups them into segments and applies
1977 '{op}' to each segment. Segments are defined by their LENGTHS.
1978 
1979 This op is basically Gather and Lengths{op} fused together.
1980 
1981 INDICES should contain integers in range 0..N-1 where N is the first dimension
1982 of DATA. INDICES represent which slices of DATA need to be pulled in.
1983 
1984 LENGTHS is a vector that defines how the gathered slices are grouped into
1985 segments. Values belonging to the same segment are aggregated together.
1986 sum(LENGTHS) has to match the size of INDICES.
1987 
1988 The first dimension of the output is equal to the number of input segments,
1989 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1990 
1991 {op_doc}
1992  )DOC";
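 // Illustrative example: with DATA of shape [4, 2], INDICES = [1, 0, 3], and
 // LENGTHS = [2, 1], using the Sum reducer (i.e. the SparseLengthsSum op):
 //   OUTPUT[0] = DATA[1] + DATA[0]  (first segment gathers two slices)
 //   OUTPUT[1] = DATA[3]            (second segment gathers one slice)
 // OUTPUT has shape [2, 2], and sum(LENGTHS) == len(INDICES) == 3.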
1993  static void PopulateSchema(OpSchema& schema) {
1994  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1995  schema.Input(
1996  Reducer::kInputCount,
1997  "INDICES",
1998  "Integer vector containing indices of the first dimension of DATA for "
1999  "the slices that are being aggregated");
2000  schema.Input(
2001  Reducer::kInputCount + 1,
2002  "LENGTHS",
2003  "Non-negative vector whose elements sum to the length of INDICES");
2004  schema.Output(
2005  0,
2006  "OUTPUT",
2007  "Aggregated output tensor. Its first dimension is K "
2008  "(the number of segments).");
2009  ReducerDef::PopulateSchema(schema);
2010  }
2011  using Reducer = typename ReducerDef::template Reducer<T, Context>;
2012  using ReducerGradient =
2013  typename ReducerDef::template ReducerGradient<T, Context>;
2014  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
2015  // TODO(dzhulgakov): we're registering the same class twice here;
2016  // consider avoiding op duplication
2017  // Note: we register the 2-input version for now because of naming in the
2018  // macro; the 3-input version will be registered separately
2019  /* INDICES is not used in the CPU version, but it is needed in the async
2020  * CUDA version. So we register the 3-input version for CPU as the gradient
2021  * op for GPU/CPU conversion. We then register the 2-input version for CPU
2022  * for backward compatibility with older nets.
2023  */
2024  using BackwardOp = AbstractLengthsGradientOp<
2025  T,
2026  SIndex,
2027  Context,
2028  ReducerGradient,
2029  false /*GradientNeedIndices*/>;
2030  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
2031  T,
2032  SIndex,
2033  Context,
2034  ReducerGradient>;
2035  // Will return the 3-input version. This aligns new CPU/GPU nets.
2036  using GetGradient = LengthsOpGetGradient<
2037  ForwardOp,
2038  ReducerDef,
2039  ReducerGradient,
2040  true /*SparseFused*/,
2041  GradientNeedIndices>;
2042 };
2043 } // namespace caffe2
2044 
2045 #endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
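
For orientation, the sketch below shows how one operator generated from these definitions could be driven through the Caffe2 C++ API. It assumes the usual registration of LengthsSum (done in segment_reduction_op.cc, not in this header); blob names, shapes, and values are illustrative only.

#include <string>
#include <vector>

#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

void RunLengthsSumSketch() {
  caffe2::Workspace ws;

  // DATA: three slices of size 2, filled with 0..5.
  auto* data = ws.CreateBlob("DATA")->GetMutable<caffe2::TensorCPU>();
  data->Resize(3, 2);
  float* d = data->mutable_data<float>();
  for (int i = 0; i < 6; ++i) {
    d[i] = static_cast<float>(i);
  }

  // LENGTHS = [2, 1]: segments DATA[0..1] and DATA[2].
  auto* lengths = ws.CreateBlob("LENGTHS")->GetMutable<caffe2::TensorCPU>();
  lengths->Resize(2);
  int* l = lengths->mutable_data<int>();
  l[0] = 2;
  l[1] = 1;

  // Run LengthsSum; OUTPUT becomes a 2x2 tensor with
  // row 0 = DATA[0] + DATA[1] and row 1 = DATA[2].
  caffe2::OperatorDef def = caffe2::CreateOperatorDef(
      "LengthsSum",
      "",
      std::vector<std::string>{"DATA", "LENGTHS"},
      std::vector<std::string>{"OUTPUT"});
  CAFFE_ENFORCE(ws.RunOperatorOnce(def));
}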