Caffe2 - C++ API
A deep learning, cross platform ML framework
segment_reduction_op.h
1 #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
2 #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
3 
4 #include "caffe2/core/context.h"
5 #include "caffe2/core/logging.h"
6 #include "caffe2/core/operator.h"
7 #include "caffe2/operators/reducer_functors.h"
8 
9 namespace caffe2 {
10 
11 template <typename TData>
12 class BaseInputAccessor {
13  public:
14  BaseInputAccessor() {}
15 
16  bool observeInput(const Tensor<CPUContext>& dataInput) {
17  data_ = dataInput.raw_data();
18  return dataInput.template IsType<TData>();
19  }
20 
21  inline const TData*
22  getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) {
23  return static_cast<const TData*>(data_) + in_block_size * idx;
24  }
25 
26  protected:
27  const void* data_ = nullptr;
28 };
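The ops in this header are parameterized on an InputAccessor with the same two-method contract as BaseInputAccessor: observeInput() validates and captures the data blob, and getBlockPtr() returns a pointer to the idx-th block of in_block_size elements. A minimal sketch of an alternative accessor over an already-typed tensor (TypedInputAccessor is a hypothetical name, not part of the library):

template <typename TData>
class TypedInputAccessor {
 public:
  bool observeInput(const Tensor<CPUContext>& dataInput) {
    // Reject unsupported types instead of enforcing, mirroring BaseInputAccessor.
    if (!dataInput.template IsType<TData>()) {
      return false;
    }
    data_ = dataInput.template data<TData>();
    return true;
  }

  inline const TData*
  getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) {
    return data_ + in_block_size * idx;
  }

 private:
  const TData* data_ = nullptr;
};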
29 
31 // Range reducer ops: leverage the fact that input segments are contiguous and
32 // allow reducer functors to do something special
33 // Note: there are no real use cases for it yet :)
34 // Also, doesn't support additional arguments for now
36 
43 template <
44  typename T,
45  typename SIndex,
46  class Context,
47  class RangeReducer,
48  class InputAccessor = BaseInputAccessor<T>>
49 class AbstractSortedSegmentRangeOp : public Operator<Context> {
50  public:
51  USE_OPERATOR_CONTEXT_FUNCTIONS;
52  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp);
53 
54  bool RunOnDevice() override {
55  auto& dataInput = Input(DATA);
56  auto& segment_ids = Input(SEGMENT_IDS);
57  auto* output = Output(0);
58 
59  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
60  auto N = segment_ids.dim(0);
61  CAFFE_ENFORCE_EQ(
62  N,
63  dataInput.dim(0),
64  "SEGMENT_IDS must have the same length as outer dimension of DATA");
65 
66  OPERATOR_NEEDS_FEATURE(
67  inputAccessor_.observeInput(dataInput),
68  "Unsupported input type: ",
69  dataInput.meta().name(),
70  ".");
71 
72  const SIndex* s_ids = segment_ids.template data<SIndex>();
73 
74  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
75  auto shape = dataInput.dims();
76  shape[0] = K;
77  output->Resize(shape);
78 
79  T* out = output->template mutable_data<T>();
80 
81  if (N == 0) {
82  return true;
83  }
84 
85  TIndex block_size = dataInput.size() / N;
86 
87  // Assume the segments are sorted and there are no gaps
88  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
89  for (TIndex i = 0; i < N;) {
90  TIndex start = i;
91  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
92  ;
93 
94  RangeReducer()(
95  block_size,
96  i - start,
97  inputAccessor_.getBlockPtr(block_size, start, i - start),
98  out + block_size * s_ids[start],
99  &context_);
100 
101  // check correctness of the next segment
102  if (i < N) {
103  CAFFE_ENFORCE_EQ(
104  s_ids[start] + 1,
105  s_ids[i],
106  "Indices must be sorted and not have gaps");
107  }
108  }
109  return true;
110  }
111 
112  static constexpr int kNumInputs = 2;
113  INPUT_TAGS(DATA, SEGMENT_IDS);
114 
115  private:
116  InputAccessor inputAccessor_;
117 };
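The RangeReducer template parameter is a plain functor invoked once per contiguous segment with the call shape used above: (block_size, number of blocks in the segment, segment input, segment output, context). The real range reducers live in reducer_functors.h; a minimal sum sketch under that contract (SumRangeReducerSketch is a hypothetical name) looks like:

template <typename T>
struct SumRangeReducerSketch {
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in, // `blocks` rows of `block_size` elements for one segment
      T* out, // one output row of `block_size` elements
      CPUContext* /* context */) {
    for (TIndex j = 0; j < block_size; ++j) {
      T acc = 0;
      for (TIndex b = 0; b < blocks; ++b) {
        acc += in[b * block_size + j];
      }
      out[j] = acc;
    }
  }
};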
118 
119 template <
120  typename T,
121  typename SIndex,
122  class Context,
123  class RangeReducerGradient>
124 class AbstractSortedSegmentRangeGradientOp : public Operator<Context> {
125  public:
126  USE_OPERATOR_CONTEXT_FUNCTIONS;
127  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp);
128 
129  bool RunOnDevice() override {
130  // TODO(azzolini): avoid using input/output if not used by a particular op
131  auto& data_in = Input(DATA_IN);
132  auto& data_out = Input(DATA_OUT);
133  auto& segment_grads = Input(SEGMENT_GRADS);
134  auto& segment_ids = Input(SEGMENT_IDS);
135  auto* data_grads = Output(0);
136 
137  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
138  TIndex N = segment_ids.dim(0);
139 
140  const SIndex* s_ids = segment_ids.template data<SIndex>();
141  const T* s_grads = segment_grads.template data<T>();
142  const T* d_in = data_in.template data<T>();
143  const T* d_out = data_out.template data<T>();
144 
145  auto shape = segment_grads.dims();
146  shape[0] = N;
147  data_grads->Resize(shape);
148 
149  const SIndex K = segment_grads.dim(0);
150  T* out = data_grads->template mutable_data<T>();
151 
152  if (N == 0) {
153  return true;
154  }
155 
156  TIndex block_size = segment_grads.size_from_dim(1);
157 
158  // Assume the segments are sorted and there are no gaps
159  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
160  // repeat the check from forward op
161  CAFFE_ENFORCE_EQ(
162  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
163  for (TIndex i = 0; i < N;) {
164  TIndex start = i;
165  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
166  ;
167 
168  auto expanded_idx = block_size * start;
169  auto reduced_idx = block_size * s_ids[start];
170  RangeReducerGradient()(
171  block_size,
172  i - start,
173  s_grads + reduced_idx,
174  out + expanded_idx,
175  d_in + expanded_idx,
176  d_out + reduced_idx,
177  &context_);
178 
179  // check correctness of the next segment
180  if (i < N) {
181  CAFFE_ENFORCE_EQ(
182  s_ids[start] + 1,
183  s_ids[i],
184  "Indices must be sorted and not have gaps");
185  }
186  }
187  return true;
188  }
189 
190  static constexpr int kNumInputs = 4;
191  INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS);
192 };
193 
194 template <typename T, typename SIndex, typename Context, typename ReducerDef>
195 struct AbstractSortedSegmentRangeDef {
196  using OpDef = ReducerDef;
197  static constexpr const char* basename = "SortedSegmentRange";
198  static constexpr const char* doc = R"DOC(
199 Applies '{op}' to each segment of input tensor. In order to allow for more
200 efficient implementation of '{op}', the input segments have to be contiguous
201 and non-empty.
202 
203 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
204 DATA to a particular group (segment). Values belonging to the same segment are
205 aggregated together.
206 
207 The first dimension of the output is equal to the number of input segments,
208 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
209 
210 {op_doc}
211  )DOC";
212  static void PopulateSchema(OpSchema& schema) {
213  schema.Input(0, "DATA", "Input tensor to be aggregated");
214  schema.Input(
215  1,
216  "SEGMENT_IDS",
217  "Vector with the same length as the first dimension of DATA "
218  "and values in the range 0..K-1 and in increasing order that "
219  "maps each slice of DATA to one of the segments");
220  schema.Output(
221  0,
222  "OUTPUT",
223  "Aggregated tensor with the first dimension of K and the "
224  "other dimensions inherited from DATA");
225  }
226  using ForwardOp = AbstractSortedSegmentRangeOp<
227  T,
228  SIndex,
229  Context,
230  typename ReducerDef::template Reducer<T, Context>>;
231  using BackwardOp = AbstractSortedSegmentRangeGradientOp<
232  T,
233  SIndex,
234  Context,
235  typename ReducerDef::template ReducerGradient<T, Context>>;
236  struct GetGradient : public GradientMakerBase {
237  using GradientMakerBase::GradientMakerBase;
238  vector<OperatorDef> GetGradientDefs() override {
239  return SingleGradientDef(
240  string(basename) + ReducerDef::name + "Gradient",
241  "",
242  vector<string>{I(0), O(0), GO(0), I(1)},
243  // no gradient on segment_ids!
244  vector<string>{GI(0)});
245  }
246  };
247 };
248 
250 // Incremental reducer ops: assume that reducer consumes pieces of data one by
251 // one. Also, supports additional arguments passed to the reducer, e.g.
252 // scalars for weighted sum.
253 //
254 // Note: in current implementation additional inputs are considered auxiliary
255 // constants and have limitations:
256 // - there is no gradient computation for auxiliary inputs
257 // - auxiliary inputs aren't affected by fused embedding lookup in operations
258 // like sparse_sorted_segment
260 
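The incremental ops below drive their Reducer template parameter through an implicit protocol: a nested Meta observes the inputs and appends the output shape, one reducer instance is constructed per output block, fed one input block at a time via process<FixedSize>(), and closed with finish<FixedSize>(); kInputCount and FixedDispatch describe the number of inputs and the fixed-size dispatch options. A compact forward-only sketch of that contract (StreamingSumReducerSketch is a hypothetical name; the real reducers are in reducer_functors.h):

template <typename T>
class StreamingSumReducerSketch {
 public:
  static constexpr int kInputCount = 1; // only DATA, no auxiliary inputs
  using FixedDispatch = FixedValues<1>; // no specialized fixed block sizes

  struct Meta {
    vector<TIndex> block_shape;
    TIndex block_size = 1;

    explicit Meta(bool /* first_dim */ = true) {}

    void observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
      CAFFE_ENFORCE_EQ(0, input, "the sketch has no auxiliary inputs");
      block_shape.assign(value.dims().begin() + skip_dims, value.dims().end());
      block_size = value.size_from_dim(skip_dims);
    }

    void appendOutputShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };

  StreamingSumReducerSketch(const Meta& meta, T* out, CPUContext* /* context */)
      : out_(out), block_size_(meta.block_size) {
    // Start from zero; each process() call accumulates one block.
    for (TIndex j = 0; j < block_size_; ++j) {
      out_[j] = T(0);
    }
  }

  template <int FixedSize>
  void process(const Meta& /* meta */, const T* in, TIndex /* offset */, CPUContext*) {
    for (TIndex j = 0; j < block_size_; ++j) {
      out_[j] += in[j];
    }
  }

  template <int FixedSize>
  void finish(const Meta& /* meta */, CPUContext* /* context */) {}

 private:
  T* out_;
  TIndex block_size_;
};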
277 template <
278  typename T,
279  class Context,
280  class Reducer,
281  bool FirstDim,
282  class InputAccessor = BaseInputAccessor<T>>
283 class AbstractReduceFrontOrBackOp : public Operator<Context> {
284  public:
285  USE_OPERATOR_CONTEXT_FUNCTIONS;
286 
287  AbstractReduceFrontOrBackOp(const OperatorDef& operator_def, Workspace* ws)
288  : Operator<Context>(operator_def, ws),
289  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
290 
291  bool RunOnDevice() override {
292  auto& data = Input(0);
293  // If more complicated fixed size logic becomes necessary, it can be moved
294  // to the reducer class
295  TIndex in_block_size = FirstDim
296  ? data.size_from_dim(num_reduce_dims_)
297  : data.size_to_dim(data.ndim() - num_reduce_dims_);
298  return DispatchHelper<typename Reducer::FixedDispatch>::call(
299  this, in_block_size);
300  }
301 
302  template <int FixedSize>
303  bool DoRunWithValue() {
304  auto& data = Input(0);
305  auto* output = Output(0);
306 
307  CAFFE_ENFORCE_LE(num_reduce_dims_, data.ndim());
308 
309  typename Reducer::Meta ctx(FirstDim);
310  ctx.observeInput(0, data, num_reduce_dims_);
311  for (int i = 1; i < Reducer::kInputCount; ++i) {
312  auto& aux_in = Input(i);
313  ctx.observeInput(i, aux_in, num_reduce_dims_);
314  }
315 
316  OPERATOR_NEEDS_FEATURE(
317  inputAccessor_.observeInput(data),
318  "Unsupported input type: ",
319  data.meta().name(),
320  ".");
321 
322  vector<TIndex> shape;
323  ctx.appendOutputShape(&shape);
324  output->Resize(shape);
325 
326  T* out = output->template mutable_data<T>();
327 
328  const int block_size = FirstDim
329  ? data.size_from_dim(num_reduce_dims_)
330  : data.size_from_dim(data.ndim() - num_reduce_dims_);
331 
332  const int num_blocks = block_size > 0 ? data.size() / block_size : 0;
333 
334  Reducer r(ctx, out, &context_);
335  for (TIndex i = 0; i < num_blocks; ++i) {
336  r.template process<FixedSize>(
337  ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
338  }
339  r.template finish<FixedSize>(ctx, &context_);
340  return true;
341  }
342 
343  static constexpr int kNumInputs = Reducer::kInputCount;
344 
345  private:
346  int num_reduce_dims_;
347  InputAccessor inputAccessor_;
348 };
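For concreteness, the block arithmetic above works out as follows (illustrative numbers, assuming DATA of shape [2, 3, 4] and num_reduce_dim = 2):

  ReduceFront (FirstDim == true):  block_size = size_from_dim(2) = 4,
                                   num_blocks = 24 / 4 = 6, output shape = [4]
  ReduceBack  (FirstDim == false): block_size = size_from_dim(3 - 2) = 12,
                                   num_blocks = 24 / 12 = 2, output shape = [2]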
349 
350 template <
351  typename T,
352  class Context,
353  class ReducerGradient,
354  bool FirstDim = true>
355 class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
356  public:
357  USE_OPERATOR_CONTEXT_FUNCTIONS;
358 
359  AbstractReduceFrontOrBackGradientOp(
360  const OperatorDef& operator_def,
361  Workspace* ws)
362  : Operator<Context>(operator_def, ws),
363  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
364 
365  bool RunOnDevice() override {
366  // If more complicated fixed size logic becomes necessary, it can be moved
367  // to the reducer class
368  TIndex grad_block_size = Input(REDUCTION_GRAD).size();
369  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
370  this, grad_block_size);
371  }
372 
373  template <int FixedSize>
374  bool DoRunWithValue() {
375  auto& reduction_grad = Input(REDUCTION_GRAD);
376  auto& source_shape = OperatorBase::Input<TensorCPU>(SOURCE_SHAPE);
377 
378  auto* data_grads = Output(0);
379 
380  typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim);
381  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
382  auto& aux_in = Input(i);
383  ctx.observeOriginalInput(
384  ReducerGradient::originalInputs()[i],
385  aux_in,
386  nullptr, /*no grad*/
387  num_reduce_dims_);
388  }
389 
390  const T* r_grad = reduction_grad.template data<T>();
391 
392  CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.size());
393 
394  vector<TIndex> shape(
395  source_shape.template data<TIndex>(),
396  source_shape.template data<TIndex>() + source_shape.size());
397 
398  data_grads->Resize(shape);
399 
400  TIndex block_size = FirstDim
401  ? data_grads->size_from_dim(num_reduce_dims_)
402  : data_grads->size_from_dim(data_grads->ndim() - num_reduce_dims_);
403  TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0;
404 
405  T* out = data_grads->template mutable_data<T>();
406 
407  ReducerGradient r(ctx, r_grad, &context_);
408  for (TIndex i = 0; i < block_num; ++i) {
409  r.template fillGrad<FixedSize>(
410  ctx,
411  out + block_size * i,
412  i,
413  &context_,
414  FirstDim ? block_num : block_size);
415  }
416  return true;
417  }
418 
419  static constexpr int kNumInputs =
420  ReducerGradient::originalInputs().size() + 2;
421  enum _InputTags {
422  REDUCTION_GRAD = ReducerGradient::originalInputs().size(),
423  SOURCE_SHAPE
424  };
425 
426  private:
427  int num_reduce_dims_;
428 };
429 
430 template <typename T, typename Context, typename ReducerDef>
431 struct AbstractReduceFrontDef {
432  using OpDef = ReducerDef;
433  static constexpr const char* basename = "ReduceFront";
434  static constexpr const char* doc = R"DOC(
435 Reduces the input tensor along the first dimension of the input tensor by
436 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
437 UnsortedSegment{op} but as if all input slices belong to a single segment.
438 
439 {op_doc}
440  )DOC";
441  static void PopulateSchema(OpSchema& schema) {
442  schema.Input(
443  0, "DATA", "Input tensor to be reduced on the first dimension");
444  schema.TensorInferenceFunction([](const OperatorDef& def,
445  const vector<TensorShape>& in) {
446  CAFFE_ENFORCE_EQ(1, in.size());
447  ArgumentHelper helper(def);
448  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
449  typename ReducerDef::template Reducer<T, Context>::Meta ctx(true);
450  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
451  return vector<TensorShape>{
452  CreateTensorShape(out_dims, in[0].data_type())};
453  });
454  ReducerDef::PopulateSchema(schema);
455  }
456  using ReducerGradient =
457  typename ReducerDef::template ReducerGradient<T, Context>;
458  using ForwardOp = AbstractReduceFrontOrBackOp<
459  T,
460  Context,
461  typename ReducerDef::template Reducer<T, Context>,
462  true>;
463  using BackwardOp =
464  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>;
465  struct GetGradient : public GradientMakerBase {
466  using GradientMakerBase::GradientMakerBase;
467  vector<OperatorDef> GetGradientDefs() override {
468  // Have utility function generating these names?
469  string tmp_dims = "_" + O(0) + "_dims";
470 
471  vector<string> grad_ins;
472  for (const int i : ReducerGradient::originalInputs()) {
473  grad_ins.push_back(I(i));
474  }
475  grad_ins.push_back(GO(0));
476  grad_ins.push_back(tmp_dims);
477 
478  vector<Argument> args;
479  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
480  args.push_back(GetArgument(def_, "num_reduce_dim"));
481  }
482  // FIXME: pass in num_reduce_dims?!
483  return vector<OperatorDef>{
484  CreateOperatorDef(
485  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
486  CreateOperatorDef(
487  string(basename) + ReducerDef::name + "Gradient",
488  "",
489  grad_ins,
490  // no gradient on auxiliary inputs for now
491  vector<string>{GI(0)}),
492  };
493  }
494  };
495 };
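As an illustration of what GetGradient above emits (assuming the Sum reducer, i.e. the registered ReduceFrontSum op, with no auxiliary reducer inputs; blob names are illustrative):

  Shape(DATA) -> _OUTPUT_dims
  ReduceFrontSumGradient(OUTPUT_grad, _OUTPUT_dims) -> DATA_grad

The Shape op saves the original input dims so the gradient op can resize its output without re-reading DATA.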
496 
497 template <typename T, typename Context, typename ReducerDef>
498 struct AbstractReduceBackDef {
499  using OpDef = ReducerDef;
500  static constexpr const char* basename = "ReduceBack";
501  static constexpr const char* doc = R"DOC(
502 Reduces the input tensor along the last dimension of the input tensor by
503 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
504 UnsortedSegment{op} but as if all input slices belong to a single segment.
505 
506 {op_doc}
507  )DOC";
508  static void PopulateSchema(OpSchema& schema) {
509  schema.Input(
510  0, "DATA", "Input tensor to be reduced on the first dimension");
511  schema.TensorInferenceFunction([](const OperatorDef& def,
512  const vector<TensorShape>& in) {
513  CAFFE_ENFORCE_EQ(1, in.size());
514  ArgumentHelper helper(def);
515  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
516  typename ReducerDef::template Reducer<T, Context>::Meta ctx(false);
517  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
518  return vector<TensorShape>{
519  CreateTensorShape(out_dims, in[0].data_type())};
520  });
521  ReducerDef::PopulateSchema(schema);
522  }
523  using ReducerGradient =
524  typename ReducerDef::template ReducerGradient<T, Context>;
525  using ForwardOp = AbstractReduceFrontOrBackOp<
526  T,
527  Context,
528  typename ReducerDef::template Reducer<T, Context>,
529  false>;
530  using BackwardOp =
531  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>;
532  struct GetGradient : public GradientMakerBase {
533  using GradientMakerBase::GradientMakerBase;
534  vector<OperatorDef> GetGradientDefs() override {
535  // Have utility function generating these names?
536  string tmp_dims = "_" + O(0) + "_dims";
537 
538  vector<string> grad_ins;
539  for (const int i : ReducerGradient::originalInputs()) {
540  grad_ins.push_back(I(i));
541  }
542  grad_ins.push_back(GO(0));
543  grad_ins.push_back(tmp_dims);
544 
545  vector<Argument> args;
546  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
547  args.push_back(GetArgument(def_, "num_reduce_dim"));
548  }
549  // FIXME: pass in num_reduce_dims?!
550  return vector<OperatorDef>{
551  CreateOperatorDef(
552  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
553  CreateOperatorDef(
554  string(basename) + ReducerDef::name + "Gradient",
555  "",
556  grad_ins,
557  // no gradient on auxiliary inputs for now
558  vector<string>{GI(0)}),
559  };
560  }
561  };
562 };
563 
586 template <
587  typename T,
588  typename SIndex,
589  class Context,
590  class Reducer,
591  bool SparseFused = true,
592  class InputAccessor = BaseInputAccessor<T>>
593 class AbstractSortedSegmentOp : public Operator<Context> {
594  public:
595  USE_OPERATOR_CONTEXT_FUNCTIONS;
596  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
597 
598  bool RunOnDevice() override {
599  if (SparseFused) {
600  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
601  this, Input(INDICES));
602  } else {
603  // type doesn't matter
604  return DoRunWithType<TIndex>();
605  }
606  }
607 
608  template <typename IndexType>
609  bool DoRunWithType() {
610  // If more complicated fixed size logic becomes necessary, it can be moved
611  // to the reducer class
612  TIndex in_block_size = Input(0).size_from_dim(1);
613  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
614  this, in_block_size);
615  }
616 
617  template <typename IndexType, int FixedSize>
618  bool DoRunWithValue() {
619  auto& dataInput = Input(0);
620  auto& segment_ids = Input(SEGMENT_IDS);
621  auto* output = Output(0);
622 
623  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
624  TIndex N = segment_ids.dim(0);
625  const TIndex M = dataInput.dim(0);
626 
627  const IndexType* idxs;
628  if (SparseFused) { // static if
629  auto& indices = Input(INDICES);
630  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
631  CAFFE_ENFORCE_EQ(
632  N,
633  indices.dim(0),
634  "SEGMENT_IDS must have the same length as INDICES");
635  idxs = indices.template data<IndexType>();
636  } else {
637  CAFFE_ENFORCE_EQ(
638  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
639  }
640 
641  // It would probably look nicer with varargs templates but it's too much
642  // metaprogramming
643  typename Reducer::Meta ctx;
644  ctx.observeInput(0, dataInput, 1);
645  for (int i = 1; i < Reducer::kInputCount; ++i) {
646  auto& aux_in = Input(i);
647  CAFFE_ENFORCE_EQ(
648  N,
649  aux_in.dim(0),
650  "Input ",
651  i,
652  " must have the same first dim as SEGMENT_IDS");
653  ctx.observeInput(i, aux_in, 1);
654  }
655 
656  OPERATOR_NEEDS_FEATURE(
657  inputAccessor_.observeInput(dataInput),
658  "Unsupported input type: ",
659  dataInput.meta().name(),
660  ".");
661 
662  const SIndex* s_ids = segment_ids.template data<SIndex>();
663 
664  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
665  vector<TIndex> shape;
666  shape.push_back(K);
667  ctx.appendOutputShape(&shape);
668  output->Resize(shape);
669 
670  T* out = output->template mutable_data<T>();
671  if (N == 0) {
672  return true;
673  }
674  TIndex in_block_size = dataInput.size_from_dim(1);
675  TIndex out_block_size = output->size_from_dim(1);
676 
677  // Assume the segments are sorted and there are no gaps
678  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
679  for (TIndex i = 0; i < N;) {
680  TIndex start = i;
681 
682  Reducer r(ctx, out + out_block_size * s_ids[start], &context_);
683  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
684  IndexType idx;
685  if (SparseFused) { // static if
686  CAFFE_ENFORCE(
687  0 <= idxs[i] && idxs[i] < M,
688  "Index out of bounds: ",
689  idxs[i],
690  ", range 0 to ",
691  M);
692  idx = idxs[i];
693  } else {
694  idx = i;
695  }
696  r.template process<FixedSize>(
697  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
698  }
699 
700  r.template finish<FixedSize>(ctx, &context_);
701  // check correctness of the next segment
702  if (i < N) {
703  CAFFE_ENFORCE_EQ(
704  s_ids[start] + 1,
705  s_ids[i],
706  "Indices must be sorted and not have gaps");
707  }
708  }
709  return true;
710  }
711 
712  enum {
713  INDICES = Reducer::kInputCount,
714  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
715  };
716  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
717  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
718 
719  private:
720  InputAccessor inputAccessor_;
721 };
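A small illustration of the semantics above, assuming the Sum reducer (as in the registered SortedSegmentSum / SparseSortedSegmentSum ops):

  DATA = [[1, 2], [3, 4], [5, 6]], SEGMENT_IDS = [0, 0, 1]
    -> OUTPUT = [[4, 6], [5, 6]]   (rows 0 and 1 form segment 0, row 2 forms segment 1)
  With SparseFused, INDICES = [2, 0, 1] first gathers rows [5, 6], [1, 2], [3, 4],
  and the same SEGMENT_IDS then give
    -> OUTPUT = [[6, 8], [3, 4]]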
722 
723 // Gradient actually doesn't depend on whether sparse lookup is fused or not
724 template <typename T, typename SIndex, class Context, class ReducerGradient>
725 class AbstractSortedSegmentGradientOp : public Operator<Context> {
726  public:
727  USE_OPERATOR_CONTEXT_FUNCTIONS;
728  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
729 
730  bool RunOnDevice() override {
731  // If more complicated fixed size logic becomes necessary, it can be moved
732  // to the reducer class
733  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
734  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
735  this, grad_block_size);
736  }
737 
738  template <int FixedSize>
739  bool DoRunWithValue() {
740  auto& segment_grads = Input(SEGMENT_GRADS);
741  auto& segment_ids = Input(SEGMENT_IDS);
742  auto* data_grads = Output(0);
743 
744  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
745  TIndex N = segment_ids.dim(0);
746 
747  typename ReducerGradient::Meta ctx(segment_grads, 1);
748  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
749  auto& aux_in = Input(i);
750  CAFFE_ENFORCE_EQ(
751  N,
752  aux_in.dim(0),
753  "Input ",
754  i,
755  " must have the same first dim as SEGMENT_IDS");
756  ctx.observeOriginalInput(
757  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
758  }
759 
760  const SIndex* s_ids = segment_ids.template data<SIndex>();
761  const T* s_grads = segment_grads.template data<T>();
762 
763  vector<TIndex> shape;
764  shape.push_back(N);
765  ctx.appendGradShape(&shape);
766  data_grads->Resize(shape);
767 
768  TIndex d_block_size = data_grads->size_from_dim(1);
769  const SIndex K = segment_grads.dim(0);
770  TIndex s_block_size = segment_grads.size_from_dim(1);
771  T* out = data_grads->template mutable_data<T>();
772 
773  if (N == 0) {
774  return true;
775  }
776 
777  // Assume the segments are sorted and there are no gaps
778  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
779  // repeat the check from forward op
780  CAFFE_ENFORCE_EQ(
781  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
782  for (TIndex i = 0; i < N;) {
783  TIndex start = i;
784  TIndex end = start;
785 
786  if (ReducerGradient::computeLength()) {
787  for (; end < N && s_ids[start] == s_ids[end]; ++end) {
788  }
789  }
790 
791  ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_);
792  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
793  r.template fillGrad<FixedSize>(
794  ctx, out + d_block_size * i, i, &context_, end - start);
795  }
796 
797  // check correctness of the next segment
798  if (i < N) {
799  CAFFE_ENFORCE_EQ(
800  s_ids[start] + 1,
801  s_ids[i],
802  "Indices must be sorted and not have gaps");
803  }
804  }
805  return true;
806  }
807 
808  // Input layout:
809  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
810  // orig_argXs represent original op's inputs and will be passed to the reducer
811  // directly
812  static constexpr int kNumInputs =
813  ReducerGradient::originalInputs().size() + 2;
814  enum _InputTags {
815  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
816  SEGMENT_IDS
817  };
818 };
819 
820 // base implementation of sorted/unsorted sparse/non-sparse gradient computation
821 template <
822  typename ForwardOp,
823  typename ReducerDef,
824  typename ReducerGradient,
825  bool Sorted,
826  bool SparseFused>
827 struct SegmentOpGetGradient : public GradientMakerBase {
828  using GradientMakerBase::GradientMakerBase;
829  vector<OperatorDef> GetGradientDefs() override {
830  CAFFE_ENFORCE(
831  !ReducerGradient::requiresDataInput(Def()),
832  "grads on aux inputs are not yet implemented for Segment operators.");
833  vector<string> grad_ins;
834  for (const int i : ReducerGradient::originalInputs()) {
835  grad_ins.push_back(I(i));
836  }
837  grad_ins.push_back(GO(0));
838  grad_ins.push_back(I(ForwardOp::SEGMENT_IDS));
839  vector<OperatorDef> r{CreateOperatorDef(
840  string(Sorted ? "SortedSegment" : "UnsortedSegment") +
841  ReducerDef::name + "Gradient",
842  "",
843  grad_ins,
844  // no gradient on segment_ids or auxiliary inputs for now
845  vector<string>{SparseFused ? GI_V(0) : GI(0)})};
846  if (SparseFused) {
847  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
848  }
849  return r;
850  }
851 };
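For example, for SortedSegmentSum (a reducer without auxiliary inputs) the maker above generates a single gradient op (blob names illustrative):

  SortedSegmentSumGradient(OUTPUT_grad, SEGMENT_IDS) -> DATA_grad

For the SparseSortedSegment variant the data gradient is emitted as a sparse slice keyed by INDICES (GI_V(0) together with SetSparse) instead of a dense GI(0).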
852 
853 template <typename T, typename SIndex, typename Context, typename ReducerDef>
854 struct AbstractSortedSegmentDef {
855  using OpDef = ReducerDef;
856  static constexpr const char* basename = "SortedSegment";
857  static constexpr const char* doc = R"DOC(
858 Applies '{op}' to each segment of input tensor. Segments need to be sorted and
859 contiguous. See also UnsortedSegment{op} that doesn't have this requirement.
860 
861 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
862 DATA to a particular group (segment). Values belonging to the same segment are
863 aggregated together.
864 
865 The first dimension of the output is equal to the number of input segments,
866 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
867 
868 {op_doc}
869  )DOC";
870  static void PopulateSchema(OpSchema& schema) {
871  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
872  schema.Input(
873  Reducer::kInputCount,
874  "SEGMENT_IDS",
875  "Vector with the same length as the first dimension of DATA "
876  "and values in the range 0..K-1 and in increasing order that "
877  "maps each slice of DATA to one of the segments");
878  schema.Output(
879  0,
880  "OUTPUT",
881  "Aggregated output tensor. Has the first dimension of K "
882  "(the number of segments).");
883  ReducerDef::PopulateSchema(schema);
884  }
885  using Reducer = typename ReducerDef::template Reducer<T, Context>;
886  using ReducerGradient =
887  typename ReducerDef::template ReducerGradient<T, Context>;
888  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>;
889  using BackwardOp =
890  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
891  using GetGradient = SegmentOpGetGradient<
892  ForwardOp,
893  ReducerDef,
894  ReducerGradient,
895  true /*Sorted*/,
896  false /*SparseFused*/>;
897 };
898 
899 template <typename T, typename SIndex, typename Context, typename ReducerDef>
900 struct AbstractSparseSortedSegmentDef {
901  using OpDef = ReducerDef;
902  static constexpr const char* basename = "SparseSortedSegment";
903  static constexpr const char* doc = R"DOC(
904 Pulls in slices of the input tensor, groups them into segments and applies
905 '{op}' to each segment. Segments need to be sorted and contiguous. See also
906 SparseUnsortedSegment{op} that doesn't have this requirement.
907 
908 This op is basically Gather and SortedSegment{op} fused together.
909 
910 INDICES should contain integers in range 0..N-1 where N is the first dimension
911 of DATA. INDICES represent which slices of DATA need to be pulled in.
912 
913 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
914 particular group (segment). Values belonging to the same segment are aggregated
915 together. SEGMENT_IDS should have the same dimension as INDICES.
916 
917 The first dimension of the output is equal to the number of input segments,
918 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
919 
920 {op_doc}
921  )DOC";
922  static void PopulateSchema(OpSchema& schema) {
923  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
924  schema.Input(
925  Reducer::kInputCount,
926  "INDICES",
927  "Integer vector containing indices of the first dimension of DATA for "
928  "the slices that are being aggregated");
929  schema.Input(
930  Reducer::kInputCount + 1,
931  "SEGMENT_IDS",
932  "Vector with the same length as INDICES and values in the range "
933  "0..K-1 and in increasing order that maps each slice of DATA referenced"
934  " by INDICES to one of the segments");
935  schema.Output(
936  0,
937  "OUTPUT",
938  "Aggregated output tensor. Has the first dimension of K "
939  "(the number of segments).");
940  ReducerDef::PopulateSchema(schema);
941  }
942  using Reducer = typename ReducerDef::template Reducer<T, Context>;
943  using ReducerGradient =
944  typename ReducerDef::template ReducerGradient<T, Context>;
945  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>;
946  // TODO(dzhulgakov): we're registering the same class twice here,
947  // consider avoiding op duplication here
948  using BackwardOp =
949  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
950  using GetGradient = SegmentOpGetGradient<
951  ForwardOp,
952  ReducerDef,
953  ReducerGradient,
954  true /*Sorted*/,
955  true /*SparseFused*/>;
956 };
957 
987 template <
988  typename T,
989  typename SIndex,
990  class Context,
991  class Reducer,
992  bool SparseFused = true,
993  class InputAccessor = BaseInputAccessor<T>>
994 class AbstractUnsortedSegmentOp : public Operator<Context> {
995  public:
996  USE_OPERATOR_CONTEXT_FUNCTIONS;
997 
998  AbstractUnsortedSegmentOp(const OperatorDef& operator_def, Workspace* ws)
999  : Operator<Context>(operator_def, ws),
1000  OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
1001 
1002  bool RunOnDevice() override {
1003  if (SparseFused) {
1004  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1005  this, Input(INDICES));
1006  } else {
1007  // type doesn't matter
1008  return DoRunWithType<TIndex>();
1009  }
1010  }
1011 
1012  template <typename IndexType>
1013  bool DoRunWithType() {
1014  // If more complicated fixed size logic becomes necessary, it can be moved
1015  // to the reducer class
1016  TIndex in_block_size = Input(0).size_from_dim(1);
1017  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1018  this, in_block_size);
1019  }
1020 
1021  template <typename IndexType, int FixedSize>
1022  bool DoRunWithValue() {
1023  auto& data = Input(0);
1024  auto& segment_ids = Input(SEGMENT_IDS);
1025  auto* output = Output(0);
1026 
1027  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1028  TIndex N = segment_ids.dim(0);
1029  const TIndex M = data.dim(0);
1030 
1031  const IndexType* idxs;
1032  if (SparseFused) { // static if
1033  auto& indices = Input(INDICES);
1034  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
1035  CAFFE_ENFORCE_EQ(
1036  N,
1037  indices.dim(0),
1038  "SEGMENT_IDS must have the same length as INDICES");
1039  idxs = indices.template data<IndexType>();
1040  } else {
1041  CAFFE_ENFORCE_EQ(
1042  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
1043  }
1044 
1045  // It would probably look nicer with varargs templates but it's too much
1046  // metaprogramming
1047  typename Reducer::Meta ctx;
1048  ctx.observeInput(0, data, 1);
1049  for (int i = 1; i < Reducer::kInputCount; ++i) {
1050  auto& aux_in = Input(i);
1051  CAFFE_ENFORCE_EQ(
1052  N,
1053  aux_in.dim(0),
1054  "Input ",
1055  i,
1056  " must have the same first dim as SEGMENT_IDS");
1057  ctx.observeInput(i, aux_in, 1);
1058  }
1059 
1060  const SIndex* s_ids = segment_ids.template data<SIndex>();
1061  OPERATOR_NEEDS_FEATURE(
1062  inputAccessor_.observeInput(data),
1063  "Unsupported input type: ",
1064  data.meta().name(),
1065  ".");
1066 
1067  // determine the number of segments
1068  SIndex K;
1069  if (num_segments_ != -1) {
1070  K = num_segments_;
1071  } else {
1072  K = 0;
1073  for (TIndex i = 0; i < N; ++i) {
1074  K = std::max(K, s_ids[i] + 1);
1075  }
1076  }
1077 
1078  vector<TIndex> shape;
1079  shape.push_back(K);
1080  ctx.appendOutputShape(&shape);
1081  output->Resize(shape);
1082 
1083  TIndex in_block_size = data.size_from_dim(1);
1084  TIndex out_block_size = output->size_from_dim(1);
1085  T* out = output->template mutable_data<T>();
1086 
1087  reducers_.clear();
1088  reducers_.reserve(K);
1089  for (TIndex i = 0; i < K; ++i) {
1090  reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
1091  }
1092 
1093  for (TIndex i = 0; i < N; ++i) {
1094  auto s_id = s_ids[i];
1095  CAFFE_ENFORCE(
1096  0 <= s_id && s_id < K,
1097  "Segment id out of range: ",
1098  s_id,
1099  ", range 0 to ",
1100  K);
1101  IndexType idx;
1102  if (SparseFused) { // static if
1103  CAFFE_ENFORCE(
1104  0 <= idxs[i] && idxs[i] < M,
1105  "Index out of bounds: ",
1106  idxs[i],
1107  ", range 0 to ",
1108  M);
1109  idx = idxs[i];
1110  } else {
1111  idx = i;
1112  }
1113  reducers_[s_id].template process<FixedSize>(
1114  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
1115  }
1116 
1117  for (TIndex i = 0; i < K; ++i) {
1118  reducers_[i].template finish<FixedSize>(ctx, &context_);
1119  }
1120  // call reducers destructors (if there is any)
1121  reducers_.clear();
1122  return true;
1123  }
1124 
1125  enum {
1126  INDICES = Reducer::kInputCount,
1127  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1128  };
1129  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1130  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1131 
1132  private:
1133  TIndex num_segments_;
1134  // member field to reuse memory
1135  vector<Reducer> reducers_;
1136  InputAccessor inputAccessor_;
1137 };
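An illustration of the unsorted variant, assuming the Sum reducer (the registered UnsortedSegmentSum op):

  DATA = [[1, 2], [3, 4], [5, 6]], SEGMENT_IDS = [1, 0, 1]
    -> K = max(SEGMENT_IDS) + 1 = 2, OUTPUT = [[3, 4], [6, 8]]
  Passing num_segments = 4 would instead produce a 4 x 2 output whose two extra
  rows hold whatever the reducer produces for an empty segment (zeros for sum).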
1138 
1139 // Gradient actually doesn't depend on whether sparse lookup is fused or not
1140 template <typename T, typename SIndex, class Context, class ReducerGradient>
1141 class AbstractUnsortedSegmentGradientOp : public Operator<Context> {
1142  public:
1143  USE_OPERATOR_CONTEXT_FUNCTIONS;
1144  USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
1145 
1146  bool RunOnDevice() override {
1147  // If more complicated fixed size logic becomes necessary, it can be moved
1148  // to the reducer class
1149  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1150  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1151  this, grad_block_size);
1152  }
1153 
1154  template <int FixedSize>
1155  bool DoRunWithValue() {
1156  auto& segment_grads = Input(SEGMENT_GRADS);
1157  auto& segment_ids = Input(SEGMENT_IDS);
1158  auto* data_grads = Output(0);
1159 
1160  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1161  TIndex N = segment_ids.dim(0);
1162 
1163  typename ReducerGradient::Meta ctx(segment_grads, 1);
1164  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1165  auto& aux_in = Input(i);
1166  CAFFE_ENFORCE_EQ(
1167  N,
1168  aux_in.dim(0),
1169  "Input ",
1170  i,
1171  " must have the same first dim as SEGMENT_IDS");
1172  ctx.observeOriginalInput(
1173  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1174  }
1175 
1176  const SIndex* s_ids = segment_ids.template data<SIndex>();
1177  const T* s_grads = segment_grads.template data<T>();
1178 
1179  vector<TIndex> shape;
1180  shape.push_back(N);
1181  ctx.appendGradShape(&shape);
1182  data_grads->Resize(shape);
1183 
1184  TIndex d_block_size = data_grads->size_from_dim(1);
1185  const SIndex K = segment_grads.dim(0);
1186  TIndex s_block_size = segment_grads.size_from_dim(1);
1187  T* out = data_grads->template mutable_data<T>();
1188 
1189  if (ReducerGradient::computeLength()) {
1190  segment_length_.resize(K, 0);
1191  for (int i = 0; i < N; ++i) {
1192  auto s_id = s_ids[i];
1193  CAFFE_ENFORCE(
1194  0 <= s_id && s_id < K,
1195  "Segment id out of range: ",
1196  s_id,
1197  ", range 0 to ",
1198  K);
1199  segment_length_[s_ids[i]]++;
1200  }
1201  }
1202 
1203  reducers_.clear();
1204  reducers_.reserve(K);
1205  for (SIndex i = 0; i < K; ++i) {
1206  reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
1207  }
1208 
1209  for (TIndex i = 0; i < N; ++i) {
1210  auto s_id = s_ids[i];
1211  if (ReducerGradient::computeLength()) {
1212  reducers_[s_id].template fillGrad<FixedSize>(
1213  ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]);
1214  } else {
1215  reducers_[s_id].template fillGrad<FixedSize>(
1216  ctx, out + d_block_size * i, i, &context_, 0);
1217  }
1218  }
1219  // call reducers destructors (if there is any)
1220  reducers_.clear();
1221  return true;
1222  }
1223 
1224  // Input layout:
1225  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
1226  // orig_argXs represent original op's inputs and will be passed to the reducer
1227  // directly
1228  static constexpr int kNumInputs =
1229  ReducerGradient::originalInputs().size() + 2;
1230  enum _InputTags {
1231  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1232  SEGMENT_IDS
1233  };
1234 
1235  private:
1236  // member field to reuse memory
1237  vector<ReducerGradient> reducers_;
1238  vector<int> segment_length_;
1239 };
1240 
1241 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1242 struct AbstractUnsortedSegmentDef {
1243  using OpDef = ReducerDef;
1244  static constexpr const char* basename = "UnsortedSegment";
1245  static constexpr const char* doc = R"DOC(
1246 Applies '{op}' to each segment of input tensor. Segments ids can appear in
1247 arbitrary order (unlike in SortedSegment{op}).
1248 
1249 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
1250 DATA to a particular group (segment). Values belonging to the same segment are
1251 aggregated together.
1252 
1253 If the `num_segments` argument is passed, it is used as the first dimension of
1254 the output. Otherwise it is computed dynamically as the max value of
1255 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1256 tensor.
1257 
1258 {op_doc}
1259  )DOC";
1260  static void PopulateSchema(OpSchema& schema) {
1261  schema.Arg(
1262  "num_segments",
1263  "Optional int argument specifying the number of output segments and "
1264  "thus the first dimension of the output");
1265  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1266  schema.Input(
1267  Reducer::kInputCount,
1268  "SEGMENT_IDS",
1269  "Integer vector with the same length as the first dimension of DATA "
1270  "that maps each slice of DATA to one of the segments");
1271  schema.Output(
1272  0,
1273  "OUTPUT",
1274  "Aggregated output tensor. Has the first dimension equal to the "
1275  "number of segments.");
1276  ReducerDef::PopulateSchema(schema);
1277  }
1278  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1279  using ReducerGradient =
1280  typename ReducerDef::template ReducerGradient<T, Context>;
1281  using ForwardOp = AbstractUnsortedSegmentOp<
1282  T,
1283  SIndex,
1284  Context,
1285  typename ReducerDef::template Reducer<T, Context>,
1286  false>;
1287  using BackwardOp =
1288  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1289  using GetGradient = SegmentOpGetGradient<
1290  ForwardOp,
1291  ReducerDef,
1292  ReducerGradient,
1293  false /*Sorted*/,
1294  false /*SparseFused*/>;
1295 };
1296 
1297 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1298 struct AbstractSparseUnsortedSegmentDef {
1299  using OpDef = ReducerDef;
1300  static constexpr const char* basename = "SparseUnsortedSegment";
1301  static constexpr const char* doc = R"DOC(
1302 Pulls in slices of the input tensor, groups them into segments and applies
1303 '{op}' to each segment. Segments ids can appear in arbitrary order (unlike in
1304 SparseSortedSegment{op}).
1305 
1306 This op is basically Gather and UnsortedSegment{op} fused together.
1307 
1308 INDICES should contain integers in range 0..N-1 where N is the first dimension
1309 of DATA. INDICES represent which slices of DATA need to be pulled in.
1310 
1311 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
1312 particular group (segment). Values belonging to the same segment are aggregated
1313 together. SEGMENT_IDS should have the same dimension as INDICES.
1314 
1315 If the `num_segments` argument is passed, it is used as the first dimension of
1316 the output. Otherwise it is computed dynamically as the max value of
1317 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1318 tensor.
1319 
1320 {op_doc}
1321  )DOC";
1322  static void PopulateSchema(OpSchema& schema) {
1323  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1324  schema.Input(
1325  Reducer::kInputCount,
1326  "INDICES",
1327  "Integer vector containing indices of the first dimension of DATA for "
1328  "the slices that are being aggregated");
1329  schema.Input(
1330  Reducer::kInputCount + 1,
1331  "SEGMENT_IDS",
1332  "Integer vector with the same length as INDICES that maps each slice "
1333  "of DATA referenced by INDICES to one of the segments");
1334  schema.Output(
1335  0,
1336  "OUTPUT",
1337  "Aggregated output tensor. Has the first dimension equal to the "
1338  "number of segments.");
1339  ReducerDef::PopulateSchema(schema);
1340  }
1341  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1342  using ReducerGradient =
1343  typename ReducerDef::template ReducerGradient<T, Context>;
1344  using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>;
1345  // TODO(dzhulgakov): we're registering the same class twice here,
1346  // consider avoiding op duplication here
1347  using BackwardOp =
1348  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1349  using GetGradient = SegmentOpGetGradient<
1350  ForwardOp,
1351  ReducerDef,
1352  ReducerGradient,
1353  false /*Sorted*/,
1354  true /*SparseFused*/>;
1355 };
1356 
1379 // TODO(dzhulgakov): for now it's implemented with incremental reducers because
1380 // of fused sparse support. But using "lengths" representation actually implies
1381 // continuous segments and thus range reducers can be used for non-sparse
1382 // version.
1383 
1384 template <
1385  typename TData,
1386  typename TLengths,
1387  class Context,
1388  class Reducer,
1389  bool SparseFused = true,
1390  class InputAccessor = BaseInputAccessor<TData>>
1391 class AbstractLengthsOp : public Operator<Context> {
1392  public:
1393  USE_OPERATOR_CONTEXT_FUNCTIONS;
1394  USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
1395 
1396  bool RunOnDevice() override {
1397  if (SparseFused) {
1398  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1399  this, Input(INDICES));
1400  } else {
1401  // type doesn't matter
1402  return DoRunWithType<TIndex>();
1403  }
1404  }
1405 
1406  template <typename IndexType>
1407  bool DoRunWithType() {
1408  // If more complicated fixed size logic becomes necessary, it can be moved
1409  // to the reducer class
1410  TIndex in_block_size = Input(0).size_from_dim(1);
1411  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1412  this, in_block_size);
1413  }
1414 
1415  template <typename IndexType, int FixedSize>
1416  bool DoRunWithValue() {
1417  auto& dataInput = Input(0);
1418  auto& lengthsInput = Input(LENGTHS);
1419  auto* output = Output(0);
1420 
1421  CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector");
1422  const TIndex dataSize = dataInput.dim(0);
1423  // Either the first dim of the data or how many slices we pull in via INDICES
1424  TIndex dataToReduceSize;
1425  const TIndex outputSize = lengthsInput.dim(0);
1426 
1427  const IndexType* indices;
1428  if (SparseFused) { // static if
1429  auto& indicesInput = Input(INDICES);
1430  CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector");
1431  indices = indicesInput.template data<IndexType>();
1432  dataToReduceSize = indicesInput.dim(0);
1433  } else {
1434  dataToReduceSize = dataSize;
1435  }
1436 
1437  typename Reducer::Meta ctx;
1438  ctx.observeInput(0, dataInput, 1);
1439  for (int i = 1; i < Reducer::kInputCount; ++i) {
1440  auto& aux_in = Input(i);
1441  CAFFE_ENFORCE(
1442  dataToReduceSize == aux_in.dim(0),
1443  "Input ",
1444  i,
1445  " must have the same first dim as SEGMENT_IDS");
1446  ctx.observeInput(i, aux_in, 1);
1447  }
1448 
1449  const TLengths* lengths = lengthsInput.template data<TLengths>();
1450 
1451  OPERATOR_NEEDS_FEATURE(
1452  inputAccessor_.observeInput(dataInput),
1453  "Unsupported input type: ",
1454  dataInput.meta().name(),
1455  ".");
1456 
1457  vector<TIndex> shape{outputSize};
1458  ctx.appendOutputShape(&shape);
1459  output->Resize(shape);
1460 
1461  TIndex in_block_size = dataInput.size_from_dim(1);
1462  TIndex out_block_size = output->size_from_dim(1);
1463  TData* out = output->template mutable_data<TData>();
1464 
1465  TIndex dataIndex = 0;
1466  for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
1467  Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
1468  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1469  ++dataIndex) {
1470  IndexType idx;
1471  if (SparseFused) { // static if
1472  idx = indices[dataIndex];
1473  CAFFE_ENFORCE(
1474  0 <= idx && idx < dataSize,
1475  "The ",
1476  dataIndex,
1477  "th index from the input indices is out of bounds: ",
1478  idx,
1479  " vs. valid range 0 to ",
1480  dataSize);
1481  } else {
1482  idx = dataIndex;
1483  CAFFE_ENFORCE(
1484  0 <= idx && idx < dataSize,
1485  "When calculating the ",
1486  rangeIndex,
1487  "th output with length=",
1488  lengths[rangeIndex],
1489  ", the index is out of bounds: ",
1490  idx,
1491  " vs. valid range 0 to ",
1492  dataSize);
1493  }
1494 
1495  const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx);
1496  reducer.template process<FixedSize>(ctx, input, dataIndex, &context_);
1497  }
1498  reducer.template finish<FixedSize>(ctx, &context_);
1499  }
1500  CAFFE_ENFORCE(
1501  dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
1502 
1503  return true;
1504  }
1505 
1506  enum {
1507  INDICES = Reducer::kInputCount,
1508  LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1509  };
1510  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1511  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1512 
1513  private:
1514  InputAccessor inputAccessor_;
1515 };
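An illustration of the lengths-based variant, assuming the Sum reducer (the registered LengthsSum / SparseLengthsSum ops):

  DATA = [[1, 2], [3, 4], [5, 6]], LENGTHS = [2, 1]
    -> OUTPUT = [[4, 6], [5, 6]]   (rows 0..1 form the first segment, row 2 the second)
  With SparseFused, INDICES = [2, 2, 0] and LENGTHS = [2, 1] instead sum rows
  [5, 6] and [5, 6] into the first output row and copy row 0 into the second:
    -> OUTPUT = [[10, 12], [1, 2]]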
1516 
1517 /*
1518  * Some notice:
1519  * 1. Gradient actually doesn't depend on whether sparse lookup is fused or not
1520  * 2. INDICES are not used in CPU version, but they are needed in async CUDA
1521  * version. So we register the 3-input version for CPU as the gradient op, for
1522  * GPU/CPU conversion. We then register the 2-input version for CPU for
1523  * backward compatibility with older nets.
1524  */
1525 template <
1526  typename T,
1527  typename TLengths,
1528  class Context,
1529  class ReducerGradient,
1530  bool GradientNeedIndices = false>
1531 class AbstractLengthsGradientOp : public Operator<Context> {
1532  public:
1533  USE_OPERATOR_CONTEXT_FUNCTIONS;
1534  USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
1535 
1536  bool RunOnDevice() override {
1537  // If more complicated fixed size logic becomes necessary, it can be moved
1538  // to the reducer class
1539  TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
1540  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1541  this, gradBlockSize);
1542  }
1543 
1544  template <int FixedSize>
1545  bool DoRunWithValue() {
1546  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1547  auto& lengthsInput = Input(LENGTHS);
1548  auto* dataGradsOutput = Output(0);
1549 
1550  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1551  TIndex reducedDataSize = 0;
1552  TIndex numSegments = lengthsInput.dim(0);
1553  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1554  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1555  const TLengths* lengths = lengthsInput.template data<TLengths>();
1556  for (TIndex i = 0; i < numSegments; ++i) {
1557  reducedDataSize += lengths[i];
1558  }
1559 
1560  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1561  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1562  auto& aux_in = Input(i);
1563  CAFFE_ENFORCE_EQ(
1564  reducedDataSize,
1565  aux_in.dim(0),
1566  "Input ",
1567  i,
1568  " must have the same first dim as SEGMENT_IDS");
1569  ctx.observeOriginalInput(
1570  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1571  }
1572 
1573  const T* segmentGrads = segmentGradsInput.template data<T>();
1574 
1575  vector<TIndex> shape;
1576  shape.push_back(reducedDataSize);
1577  ctx.appendGradShape(&shape);
1578  dataGradsOutput->Resize(shape);
1579 
1580  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1581  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1582  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1583 
1584  TIndex dataIndex = 0;
1585  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1586  ReducerGradient reducer(
1587  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1588  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1589  ++dataIndex) {
1590  reducer.template fillGrad<FixedSize>(
1591  ctx,
1592  dataGrads + dataGradsBlockSize * dataIndex,
1593  dataIndex,
1594  &context_,
1595  lengths[rangeIndex]);
1596  }
1597  }
1598  CAFFE_ENFORCE(
1599  dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
1600  return true;
1601  }
1602 
1603  // Input layout:
1604  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, INDICES
1605  // orig_argXs represent original op's inputs and will be passed to the reducer
1606  // directly
1607  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1608  2 + (GradientNeedIndices ? 1 : 0);
1609  enum _InputTags {
1610  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1611  LENGTHS,
1612  INDICES
1613  };
1614 };
1615 
1616 // Version of gradient that requires the main input and thus needs to receive
1617 // length, indices and other stuff
1618 template <
1619  typename T,
1620  typename TLengths,
1621  class Context,
1622  class ReducerGradient,
1623  bool SparseFused = true,
1624  bool GradientNeedIndices = false>
1625 class AbstractLengthsWithMainInputGradientOp : public Operator<Context> {
1626  public:
1627  USE_OPERATOR_CONTEXT_FUNCTIONS;
1628  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp);
1629 
1630  bool RunOnDevice() override {
1631  if (SparseFused) {
1632  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1633  this, Input(INDICES));
1634  } else {
1635  // type doesn't matter
1636  return DoRunWithType<TIndex>();
1637  }
1638  }
1639 
1640  template <typename IndexType>
1641  bool DoRunWithType() {
1642  // If more complicated fixed size logic becomes necessary, it can be moved
1643  // to the reducer class
1644  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1645  return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>::
1646  call(this, in_block_size);
1647  }
1648 
1649  template <typename IndexType, int FixedSize>
1650  bool DoRunWithValue() {
1651  auto& dataInput = Input(DATA_INPUT);
1652  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1653  auto& lengthsInput = Input(LENGTHS);
1654  auto* dataGradsOutput = Output(0);
1655 
1656  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1657  TIndex numSegments = lengthsInput.dim(0);
1658  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1659  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1660  const TLengths* lengths = lengthsInput.template data<TLengths>();
1661 
1662  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1663  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1664  int aux_num = ReducerGradient::originalInputs()[i];
1665  auto& aux_in = Input(i);
1666  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1667  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1668  }
1669 
1670  // Either the first dim of the data or how many slices we pull in via INDICES
1671  TIndex dataToReduceSize;
1672  const IndexType* indices = nullptr;
1673  if (SparseFused) { // static if
1674  auto& indicesInput = Input(INDICES);
1675  indices = indicesInput.template data<IndexType>();
1676  dataToReduceSize = indicesInput.dim(0);
1677  } else {
1678  dataToReduceSize = dataInput.dim(0);
1679  }
1680 
1681  const T* segmentGrads = segmentGradsInput.template data<T>();
1682 
1683  vector<TIndex> shape;
1684  shape.push_back(dataToReduceSize);
1685  ctx.appendGradShape(&shape);
1686  dataGradsOutput->Resize(shape);
1687 
1688  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1689  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1690  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1691 
1692  const T* data = dataInput.template data<T>();
1693 
1694  TIndex dataIndex = 0;
1695  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1696  ReducerGradient reducer(
1697  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1698  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1699  ++dataIndex) {
1700  IndexType data_pos;
1701  // No range checking, should've been verified in forward pass
1702  if (SparseFused) { // static if
1703  data_pos = indices[dataIndex];
1704  } else {
1705  data_pos = dataIndex;
1706  }
1707  reducer.template fillGradWithMainInput<FixedSize>(
1708  ctx,
1709  data + dataGradsBlockSize * data_pos,
1710  dataGrads + dataGradsBlockSize * dataIndex,
1711  dataIndex,
1712  &context_,
1713  lengths[rangeIndex]);
1714  }
1715  }
1716  return true;
1717  }
1718 
1719  // Input layout:
1720  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS,
1721  // DATA_INPUT, [INDICES]
1722  // orig_argXs represent original op's inputs and will be passed to the reducer
1723  // directly
1724  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1725  3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0);
1726  enum _InputTags {
1727  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1728  LENGTHS,
1729  DATA_INPUT,
1730  INDICES,
1731  };
1732 };
1733 
1734 // Version of gradient that requires the main input as well as the output of the
1735 // forward op.
1736 template <typename T, typename TLengths, class Context, class ReducerGradient>
1737 class AbstractLengthsWithMainInputAndForwardOutputGradientOp
1738  : public Operator<Context> {
1739  public:
1740  USE_OPERATOR_CONTEXT_FUNCTIONS;
1741  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp);
1742 
1743  bool RunOnDevice() override {
1744  // If more complicated fixed size logic becomes necessary, it can be moved
1745  // to the reducer class.
1746  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1747  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1748  this, in_block_size);
1749  }
1750 
1751  template <int FixedSize>
1752  bool DoRunWithValue() {
1753  auto& dataInput = Input(DATA_INPUT);
1754  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1755  auto& lengthsInput = Input(LENGTHS);
1756  auto& forwardOutputInput = Input(FORWARD_OUTPUT);
1757  auto* dataGradsOutput = Output(0);
1758 
1759  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1760  TIndex numSegments = lengthsInput.dim(0);
1761  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1762  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1763  const TLengths* lengths = lengthsInput.template data<TLengths>();
1764 
1765  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1766  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1767  int aux_num = ReducerGradient::originalInputs()[i];
1768  auto& aux_in = Input(i);
1769  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1770  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1771  }
1772 
1773  CAFFE_ENFORCE(forwardOutputInput.ndim() > 0);
1774  CAFFE_ENFORCE(numSegments == forwardOutputInput.dim(0));
1775  const T* forwardOutput = forwardOutputInput.template data<T>();
1776 
1777  TIndex dataToReduceSize = dataInput.dim(0);
1778 
1779  const T* segmentGrads = segmentGradsInput.template data<T>();
1780 
1781  vector<TIndex> shape;
1782  shape.push_back(dataToReduceSize);
1783  ctx.appendGradShape(&shape);
1784  dataGradsOutput->Resize(shape);
1785 
1786  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1787  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1788  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1789 
1790  const T* data = dataInput.template data<T>();
1791 
1792  TIndex dataIndex = 0;
1793  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1794  ReducerGradient reducer(
1795  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1796  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1797  ++dataIndex) {
1798  // No range checking, should've been verified in forward pass
1799  reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>(
1800  ctx,
1801  data + dataGradsBlockSize * dataIndex,
1802  dataGrads + dataGradsBlockSize * dataIndex,
1803  forwardOutput + segmentBlockSize * rangeIndex,
1804  dataIndex,
1805  &context_,
1806  lengths[rangeIndex]);
1807  }
1808  }
1809  return true;
1810  }
1811 
1812  // Input layout:
1813  // orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, SEGMENT_GRADS,
1814  // LENGTHS, DATA_INPUT
1815  // orig_argXs represent original op's inputs and will be passed to the reducer
1816  // directly
1817  static constexpr int kNumInputs =
1818  ReducerGradient::originalInputs().size() + 4;
1819  enum _InputTags {
1820  FORWARD_OUTPUT = ReducerGradient::originalInputs().size(),
1821  SEGMENT_GRADS,
1822  LENGTHS,
1823  DATA_INPUT,
1824  };
1825 };
1826 
1827 // base implementation of sparse/non-sparse gradient computation
1828 template <
1829  typename ForwardOp,
1830  typename ReducerDef,
1831  typename ReducerGradient,
1832  bool SparseFused,
1833  bool GradientNeedIndices = false>
1834 struct LengthsOpGetGradient : public GradientMakerBase {
1835  using GradientMakerBase::GradientMakerBase;
1836  vector<OperatorDef> GetGradientDefs() override {
1837  vector<string> grad_ins;
1838  string suffix = "Gradient";
1839  for (const int i : ReducerGradient::originalInputs()) {
1840  grad_ins.push_back(I(i));
1841  }
1842  if (ReducerGradient::requiresForwardOutput()) {
1843  grad_ins.push_back(O(0));
1844  CAFFE_ENFORCE(
1845  !SparseFused,
1846  "Forward pass output not yet supported as input for backward pass "
1847  "for SparseLengthsXXX operators");
1848  suffix = "AndForwardOutput" + suffix;
1849  }
1850  grad_ins.push_back(GO(0));
1851  grad_ins.push_back(I(ForwardOp::LENGTHS));
1852  bool indices_pushed = false;
1853  if (ReducerGradient::requiresDataInput(Def())) {
1854  grad_ins.push_back(I(0));
1855  if (SparseFused) {
1856  grad_ins.push_back(I(ForwardOp::INDICES));
1857  indices_pushed = true;
1858  }
1859  suffix = "WithMainInput" + suffix;
1860  }
1861  if (GradientNeedIndices && !indices_pushed) {
1862  if (SparseFused) {
1863  grad_ins.push_back(I(ForwardOp::INDICES));
1864  } else {
1865  // Hacky: pass the main input in place of INDICES; remove this once a
1866  // specialized CUDA LengthsIndicesInGradientSumGradient exists
1867  grad_ins.push_back(I(0));
1868  }
1869  }
1870  vector<string> grad_outs;
1871  grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)});
1872  int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def());
1873  for (int i = 1; i <= aux_grads; ++i) {
1874  grad_outs.push_back(GI(i));
1875  }
1876  vector<OperatorDef> r{CreateOperatorDef(
1877  string(SparseFused ? "SparseLengths" : "Lengths") +
1878  string(GradientNeedIndices ? "IndicesInGradient" : "") +
1879  ReducerDef::name + suffix,
1880  "",
1881  grad_ins,
1882  grad_outs)};
1883  if (SparseFused) {
1884  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
1885  }
1886  return r;
1887  }
1888 };
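 // Illustrative example: with ReducerDef::name == "Sum", SparseFused == true,
 // and a reducer gradient that needs neither the forward output nor the main
 // input, the generated gradient op is "SparseLengthsSumGradient"; setting
 // GradientNeedIndices yields "SparseLengthsIndicesInGradientSumGradient".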
1889 
1890 template <
1891  typename T,
1892  typename SIndex,
1893  typename Context,
1894  typename ReducerDef,
1895  bool GradientNeedIndices = false>
1896 struct AbstractLengthsDef {
1897  using OpDef = ReducerDef;
1898  static constexpr const char* basename = "Lengths";
1899  static constexpr const char* doc = R"DOC(
1900 Applies '{op}' to each segment of the input tensor. Segments are defined
1901 by their LENGTHS.
1902 
1903 LENGTHS is a vector that maps each of the first dimension slices of the
1904 DATA to a particular group (segment). Values belonging to the same segment are
1905 aggregated together.
1906 
1907 For example, LENGTHS = [2, 1] stands for segments DATA[0..1] and DATA[2].
1908 
1909 The first dimension of the output is equal to the number of input segments,
1910 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1911 
1912 {op_doc}
1913  )DOC";
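 // Illustrative example: with DATA of shape [3, 2], LENGTHS = [2, 1], and the
 // Sum reducer (i.e. the LengthsSum op), OUTPUT has shape [2, 2]:
 //   OUTPUT[0] = DATA[0] + DATA[1]  (first segment, length 2)
 //   OUTPUT[1] = DATA[2]            (second segment, length 1)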
1914  static void PopulateSchema(OpSchema& schema) {
1915  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1916  schema.Input(
1917  Reducer::kInputCount,
1918  "LENGTHS",
1919  "Vector of segment lengths whose elements sum to the first dimension of DATA");
1920  schema.Output(
1921  0,
1922  "OUTPUT",
1923  "Aggregated output tensor. Its first dimension is len(LENGTHS).");
1924  schema.TensorInferenceFunction(
1925  [](const OperatorDef& def, const vector<TensorShape>& in) {
1926  vector<TensorShape> out(0);
1927  TensorShape output;
1928  for (int d : in[Reducer::kInputCount].dims()) {
1929  output.add_dims(d);
1930  }
1931  for (int j = 1; j < in[0].dims_size(); j++) {
1932  output.add_dims(in[0].dims(j));
1933  }
1934  output.set_data_type(in[0].data_type());
1935  out.push_back(output);
1936  return out;
1937  });
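 // The inference function above therefore yields an OUTPUT shape of
 // [len(LENGTHS)] followed by DATA.dims()[1:].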
1938  ReducerDef::PopulateSchema(schema);
1939  }
1940  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1941  using ReducerGradient =
1942  typename ReducerDef::template ReducerGradient<T, Context>;
1943  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
1944  using BackwardOp =
1945  AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
1946  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
1947  T,
1948  SIndex,
1949  Context,
1950  ReducerGradient,
1951  false>;
1952  using WithMainInputAndForwardOutputBackwardOp =
1953  AbstractLengthsWithMainInputAndForwardOutputGradientOp<
1954  T,
1955  SIndex,
1956  Context,
1957  ReducerGradient>;
1958  using GetGradient = LengthsOpGetGradient<
1959  ForwardOp,
1960  ReducerDef,
1961  ReducerGradient,
1962  false /*SparseFused*/,
1963  GradientNeedIndices>;
1964 };
1965 
1966 template <
1967  typename T,
1968  typename SIndex,
1969  typename Context,
1970  typename ReducerDef,
1971  bool GradientNeedIndices = false>
1972 struct AbstractSparseLengthsDef {
1973  using OpDef = ReducerDef;
1974  static constexpr const char* basename = "SparseLengths";
1975  static constexpr const char* doc = R"DOC(
1976 Pulls in slices of the input tensor, groups them into segments and applies
1977 '{op}' to each segment. Segments are defined by their LENGTHS.
1978 
1979 This op is basically Gather and Lengths{op} fused together.
1980 
1981 INDICES should contain integers in range 0..N-1 where N is the first dimension
1982 of DATA. INDICES represent which slices of DATA need to be pulled in.
1983 
1984 LENGTHS is a vector that defines how the gathered slices are grouped into
1985 segments. Values belonging to the same segment are aggregated together.
1986 sum(LENGTHS) has to match the size of INDICES.
1987 
1988 The first dimension of the output is equal to the number of input segments,
1989 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1990 
1991 {op_doc}
1992  )DOC";
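 // Illustrative example: with DATA of shape [4, 2], INDICES = [1, 0, 3], and
 // LENGTHS = [2, 1], using the Sum reducer (i.e. the SparseLengthsSum op):
 //   OUTPUT[0] = DATA[1] + DATA[0]  (first segment gathers two slices)
 //   OUTPUT[1] = DATA[3]            (second segment gathers one slice)
 // OUTPUT has shape [2, 2], and sum(LENGTHS) == len(INDICES) == 3.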
1993  static void PopulateSchema(OpSchema& schema) {
1994  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1995  schema.Input(
1996  Reducer::kInputCount,
1997  "INDICES",
1998  "Integer vector containing indices of the first dimension of DATA for "
1999  "the slices that are being aggregated");
2000  schema.Input(
2001  Reducer::kInputCount + 1,
2002  "LENGTHS",
2003  "Non-negative vector whose elements sum to the length of INDICES");
2004  schema.Output(
2005  0,
2006  "OUTPUT",
2007  "Aggregated output tensor. Its first dimension is K "
2008  "(the number of segments).");
2009  ReducerDef::PopulateSchema(schema);
2010  }
2011  using Reducer = typename ReducerDef::template Reducer<T, Context>;
2012  using ReducerGradient =
2013  typename ReducerDef::template ReducerGradient<T, Context>;
2014  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
2015  // TODO(dzhulgakov): we're registering the same class twice here;
2016  // consider avoiding op duplication
2017  // Note: we register the 2-input version for now because of naming in the
2018  // macro; the 3-input version will be registered separately
2019  /* INDICES is not used in the CPU version, but it is needed in the async
2020  * CUDA version. So we register the 3-input version for CPU as the gradient
2021  * op for GPU/CPU conversion. We then register the 2-input version for CPU
2022  * for backward compatibility with older nets.
2023  */
2024  using BackwardOp = AbstractLengthsGradientOp<
2025  T,
2026  SIndex,
2027  Context,
2028  ReducerGradient,
2029  false /*GradientNeedIndices*/>;
2030  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
2031  T,
2032  SIndex,
2033  Context,
2034  ReducerGradient>;
2035  // Will return the 3-input version. This aligns new CPU/GPU nets.
2036  using GetGradient = LengthsOpGetGradient<
2037  ForwardOp,
2038  ReducerDef,
2039  ReducerGradient,
2040  true /*SparseFused*/,
2041  GradientNeedIndices>;
2042 };
2043 } // namespace caffe2
2044 
2045 #endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
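
For orientation, the sketch below shows how one operator generated from these definitions could be driven through the Caffe2 C++ API. It assumes the usual registration of LengthsSum (done in segment_reduction_op.cc, not in this header); blob names, shapes, and values are illustrative only.

#include <string>
#include <vector>

#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

void RunLengthsSumSketch() {
  caffe2::Workspace ws;

  // DATA: three slices of size 2, filled with 0..5.
  auto* data = ws.CreateBlob("DATA")->GetMutable<caffe2::TensorCPU>();
  data->Resize(3, 2);
  float* d = data->mutable_data<float>();
  for (int i = 0; i < 6; ++i) {
    d[i] = static_cast<float>(i);
  }

  // LENGTHS = [2, 1]: segments DATA[0..1] and DATA[2].
  auto* lengths = ws.CreateBlob("LENGTHS")->GetMutable<caffe2::TensorCPU>();
  lengths->Resize(2);
  int* l = lengths->mutable_data<int>();
  l[0] = 2;
  l[1] = 1;

  // Run LengthsSum; OUTPUT becomes a 2x2 tensor with
  // row 0 = DATA[0] + DATA[1] and row 1 = DATA[2].
  caffe2::OperatorDef def = caffe2::CreateOperatorDef(
      "LengthsSum",
      "",
      std::vector<std::string>{"DATA", "LENGTHS"},
      std::vector<std::string>{"OUTPUT"});
  CAFFE_ENFORCE(ws.RunOperatorOnce(def));
}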