Caffe2 - C++ API
A deep learning, cross-platform ML framework
utility_ops.cc
1 #include "caffe2/operators/utility_ops.h"
2 
3 #include <cmath>
4 
5 namespace caffe2 {
6 
7 template <>
8 bool WeightedSumOp<CPUContext>::RunOnDevice() {
9  return DoRunWithType<float>();
10 }
11 
12 template <>
13 bool WeightedSumGradientOp<CPUContext>::RunOnDevice() {
14  return DoRunWithType<float>();
15 }
16 
17 template <>
18 template <typename T>
19 void UniqueOp<CPUContext>::DoRun() {
20  auto& inputTensor = Input(0);
21  // use dim32 to enforce that it's fine to have remapping of type int
22  int N = inputTensor.dim32(0);
23  CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector");
24  auto* uniqueTensor = Output(UNIQUE);
25 
26  int* remapping = nullptr;
27  if (REMAPPING < OutputSize()) {
28  auto* remappingTensor = Output(REMAPPING);
29  remappingTensor->ResizeLike(inputTensor);
30  remapping = remappingTensor->template mutable_data<int>();
31  }
32 
33  const T* input = inputTensor.template data<T>();
34  // TODO(dzhulgakov): if perf becomes an issue consider doing hash table
35  // instead of sorting
36  order_.resize(N);
37  std::iota(order_.begin(), order_.end(), 0);
38  std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
39  return input[x] < input[y];
40  });
41  int K = N;
42  for (int i = 1; i < N; ++i) {
43  K -= input[order_[i]] == input[order_[i - 1]];
44  }
45  uniqueTensor->Resize(K);
46  T* unique = uniqueTensor->template mutable_data<T>();
47  K = 0;
48  T prev = -1;
49  for (int i = 0; i < N; ++i) {
50  if (i == 0 || prev != input[order_[i]]) {
51  prev = unique[K++] = input[order_[i]];
52  }
53  if (remapping) {
54  remapping[order_[i]] = K - 1;
55  }
56  }
57 }
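// Worked example of the sort-based dedup above, assuming T = int and
// input = [3, 1, 3, 2]:
//   order_ (indices sorted by value) = [1, 3, 0, 2]
//   K (number of distinct values)    = 3
//   unique                           = [1, 2, 3]
//   remapping                        = [2, 0, 2, 1]
// so gathering unique with remapping reproduces the original input.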
58 
59 REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
60 REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
61 REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);
62 REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
63 REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
64 REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
65 REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
66 REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
67 REGISTER_CPU_OPERATOR(
68  ScatterWeightedSum,
69  ScatterWeightedSumOp<float, CPUContext>);
70 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
71 // From whatever the current context, ensure the output is TensorCPU
72 REGISTER_CPU_OPERATOR(
73  EnsureCPUOutput,
74  CopyOp<CPUContext, CPUContext, CPUContext>);
75 // From CPU, copy it to whatever the current context
76 REGISTER_CPU_OPERATOR(
77  CopyFromCPUInput,
78  CopyOp<CPUContext, CPUContext, CPUContext>);
79 REGISTER_CPU_OPERATOR(
80  CopyOnDeviceLike,
81  CopyOnDeviceLikeOp<CPUContext, CPUContext, CPUContext>);
82 REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
83 REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp<CPUContext>);
84 REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
85 REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
86 REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
87 REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
88 REGISTER_CPU_OPERATOR(LengthsGather, LengthsGatherOp<CPUContext>);
89 REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
90 REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
91 REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
92 REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
93 REGISTER_CPU_OPERATOR(SegmentIdsToRanges, SegmentIdsToRangesOp<CPUContext>);
94 REGISTER_CPU_OPERATOR(LengthsToWeights, LengthsToWeightsOp<CPUContext>);
95 REGISTER_CPU_OPERATOR(EnsureDense, EnsureDenseOp<CPUContext>);
96 REGISTER_CPU_OPERATOR(
97  AccumulateHistogram,
98  AccumulateHistogramOp<float, CPUContext>);
99 
100 OPERATOR_SCHEMA(WallClockTime)
101  .NumInputs(0)
102  .NumOutputs(1)
103  .SetDoc("Time since epoch in nanoseconds.")
104  .Output(0, "time", "The time in nanoseconds.");
105 
106 REGISTER_CPU_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CPUContext>);
107 
108 OPERATOR_SCHEMA(Print)
109  .NumInputs(1)
110  .NumOutputs(0)
111  .SetDoc("Logs shape and contents of input tensor to stderr or to a file.")
112  .Arg(
113  "to_file",
114  "(bool) if 1, saves contents to the root folder of the current "
115  "workspace, appending the tensor contents to a file named after "
116  "the blob name. Otherwise, logs to stderr.")
117  .Input(0, "tensor", "The tensor to print.");
118 
119 OPERATOR_SCHEMA(LengthsToShape).NumInputs(1).NumOutputs(1);
120 
121 OPERATOR_SCHEMA(FlattenToVec)
122  .NumInputs(1)
123  .NumOutputs(1)
124  .TensorInferenceFunction([](const OperatorDef& /*def*/,
125  const vector<TensorShape>& in) {
126  vector<TensorShape> out(1);
127  int total = 1;
128  for (auto d : in[0].dims()) {
129  total *= d;
130  }
131  out[0].set_data_type(in[0].data_type());
132  out[0].add_dims(total);
133  return out;
134  })
135  .SetDoc(R"DOC(
136 Flattens the input tensor into a 1D vector.
137 )DOC")
138  .Input(0, "input", "A tensor of rank >= 1.")
139  .Output(
140  0,
141  "output",
142  "A tensor of rank 1 with the contents of the input tensor");
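// Example: a 2 x 3 input flattens (in row-major order) to a 6-element vector:
//   input  = [[1, 2, 3], [4, 5, 6]]
//   output = [1, 2, 3, 4, 5, 6]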
143 
144 OPERATOR_SCHEMA(Alias)
145  .NumInputs(1)
146  .NumOutputs(1)
147  .IdenticalTypeAndShape()
148  .SetDoc(R"DOC(
149 Makes the output and the input share the same underlying storage.
150 
151 WARNING: in general, in caffe2's operator interface different tensors should
152 have different underlying storage, which is the assumption made by
153 components such as the dependency engine and memory optimization. Thus, in
154 normal situations you should not use the AliasOp, especially in a normal
155 forward-backward pass.
156 
157 The Alias op is provided so one can achieve true asynchrony, such as
158 Hogwild, in a graph. But make sure you understand all the implications
159 similar to multi-thread computation before you use it explicitly.
160 )DOC")
161  .Input(0, "input", "Input tensor whose storage will be shared.")
162  .Output(0, "output", "Tensor of same shape as input, sharing its storage.");
163 
164 OPERATOR_SCHEMA(ResizeLike)
165  .NumInputs(2)
166  .NumOutputs(1)
167  .TensorInferenceFunction([](const OperatorDef& /*def*/,
168  const vector<TensorShape>& in) {
169  vector<TensorShape> out(1);
170  out[0] = in[1];
171  out[0].set_data_type(in[0].data_type());
172  return out;
173  })
174  .SetDoc(R"DOC(
175 Produces tensor containing data of first input and shape of second input.
176 )DOC")
177  .Input(0, "data", "Tensor whose data will be copied into the output.")
178  .Input(1, "shape_tensor", "Tensor whose shape will be applied to output.")
179  .Output(0, "output", "Tensor with data of input 0 and shape of input 1.");
180 
181 OPERATOR_SCHEMA(SumInt)
182  .NumInputs(1, INT_MAX)
183  .NumOutputs(1)
184  .InputsCanCrossDevices()
185  .TensorInferenceFunction([](const OperatorDef& /*def*/,
186  const vector<TensorShape>& in) {
187  vector<TensorShape> out(1);
188  out[0] = in[0];
189  out[0].set_data_type(TensorProto::INT32);
190  return out;
191  })
192  .AllowInplace({{0, 0}});
193 
194 OPERATOR_SCHEMA(WeightedSum)
195  .NumInputs([](int n) { return (n > 0 && n % 2 == 0); })
196  .NumOutputs(1)
197  .AllowInplace({{0, 0}})
198  .IdenticalTypeAndShapeOfInput(0)
199  .SetDoc(R"DOC(
200 Element-wise weighted sum of several data, weight tensor pairs.
201 Input should be in the form X_0, weight_0, X_1, weight_1, ... where X_i all
202 have the same shape, and weight_i are size-1 tensors that specify the weight
203 of each vector. Note that in-place computation is only supported with X_0
204 as the output, not with any other X_i.
205 )DOC")
206  .Input(0, "data_0", "First of the input tensors.")
207  .Input(1, "weight_0", "Weight of the first input in the sum.")
208  .Output(0, "output", "Result containing weighted elem-wise sum of inputs.");
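// Example semantics (a minimal sketch with two float input pairs):
//   output = weight_0 * X_0 + weight_1 * X_1   (element-wise)
//   X_0 = [1, 2], weight_0 = [2], X_1 = [10, 20], weight_1 = [0.5]
//   output = [2*1 + 0.5*10, 2*2 + 0.5*20] = [7, 14]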
209 
210 OPERATOR_SCHEMA(WeightedSumGradient)
211  .NumInputs([](int n) { return (n > 0 && n % 2 == 1); })
212  .NumOutputs(1, INT_MAX);
213 
214 OPERATOR_SCHEMA(ScatterWeightedSum)
215  .NumInputs([](int n) { return (n > 3 && (n - 3) % 2 == 0); })
216  .NumOutputs(1)
217  .EnforceInplace({{0, 0}})
218  .SetDoc(R"DOC(
219 Similar to WeightedSum, computes the weighted sum of several tensors, with
220 the difference that inputs are sliced tensors. The first tensor has to be
221 in-place and only slices of it on the first dimension as indexed by INDICES
222 will be updated.
223 
224 Note: The op largely ignores the exact shapes of the input arguments and
225 cares only about sizes. This is done for performance, to avoid
226 unnecessary reshapes. Only the first dimension of X_0 matters; call it
227 N. If M is the total size of X_0 and K is the size of INDICES, then X_i is
228 assumed to be of shape K x (M / N) regardless of its real shape.
229 
230 Note: Each update in INDICES is applied independently which means that if
231 duplicated elements are present in INDICES the corresponding slice of X_0
232 will be scaled multiple times. Manual collapsing of INDICES is required
233 beforehand if necessary.
234 
235 Note: Updates are applied sequentially over the inputs, which might have
236 undesired consequences if the input tensor is accessed concurrently by a
237 different op (e.g. when doing Hogwild). Other threads might see intermediate
238 results even at the individual slice level, e.g. X_0 scaled by weight_0 but
239 without any updates applied.
240 
241 Currently only works on CPU because of access to INDICES.
242 )DOC")
243  .Input(0, "X_0", "Tensor to be updated.")
244  .Input(
245  1,
246  "Weight_0",
247  "Scalar weight for X_0, applied only to slices affected.")
248  .Input(
249  2,
250  "INDICES",
251  "1-D list of indices on the first dimension of X_0 "
252  "that need to be updated")
253  .Input(3, "X_1", "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
254  .Input(4, "Weight_1", "Scalar weight for X_1 update")
255  .Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
256  .EnforceInplace({{0, 0}});
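// Example semantics (a sketch, assuming a single update pair X_1 / Weight_1):
//   for each k in range(len(INDICES)):
//     X_0[INDICES[k], :] = Weight_0 * X_0[INDICES[k], :] + Weight_1 * X_1[k, :]
// e.g. X_0 = [[1, 1], [2, 2], [3, 3]], Weight_0 = [1], INDICES = [2, 0],
//      X_1 = [[10, 10], [20, 20]], Weight_1 = [0.1]
//      => X_0 becomes [[3, 3], [2, 2], [4, 4]]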
257 
258 OPERATOR_SCHEMA(ScatterAssign)
259  .NumInputs(3)
260  .NumOutputs(1)
261  .EnforceInplace({{0, 0}})
262  .SetDoc(R"DOC(
263 Update slices of the tensor in-place by overriding current value.
264 
265 Note: The op largely ignores the exact shapes of the input arguments and
266 cares only about sizes. This is done for performance, to avoid
267 unnecessary reshapes. Only the first dimension of X_0 matters; call it
268 N. If M is the total size of X_0 and K is the size of INDICES, then X_i is
269 assumed to be of shape K x (M / N) regardless of its real shape.
270 
271 Note: Each update in INDICES is applied independently, which means that if
272 duplicated elements are present in INDICES, an arbitrary one will win.
273 
274 Currently only works on CPU because of access to INDICES.
275 )DOC")
276  .Input(0, "DATA", "Tensor to be updated.")
277  .Input(
278  1,
279  "INDICES",
280  "1-D list of indices on the first dimension "
281  "of X_0 that need to be updated")
282  .Input(
283  2,
284  "SLICES",
285  "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
286  .Output(0, "DATA", "Has to be exactly the same tensor as the input 0");
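// Example semantics (a sketch of the update rule described above):
//   for each k in range(len(INDICES)):
//     DATA[INDICES[k], :] = SLICES[k, :]
// e.g. DATA = [[1, 1], [2, 2], [3, 3]], INDICES = [0, 2],
//      SLICES = [[9, 9], [8, 8]]  =>  DATA becomes [[9, 9], [2, 2], [8, 8]]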
287 
288 OPERATOR_SCHEMA(Copy)
289  .NumInputs(1)
290  .NumOutputs(1)
291  .IdenticalTypeAndShape()
292  .InputsCanCrossDevices()
293  .SetDoc("Copy input tensor into output, potentially across devices.")
294  .Input(0, "input", "The input tensor.")
295  .Output(0, "output", "Tensor that will contain a copy of the input.");
296 
297 OPERATOR_SCHEMA(CopyGPUToCPU)
298  .NumInputs(1)
299  .NumOutputs(1)
300  .IdenticalTypeAndShape()
301  .InputsCanCrossDevices()
302  .DeviceInferenceFunction([](const OperatorDef& def) {
303  CAFFE_ENFORCE(
304  def.has_device_option(),
305  "CopyGPUToCPU op should have cuda device option.");
306  auto& cuda_option = def.device_option();
307  auto cpu_option = DeviceOption();
308  vector<DeviceOption> in_dev(def.input_size(), cuda_option);
309  vector<DeviceOption> out_dev(def.output_size(), cpu_option);
310  return std::make_pair(in_dev, out_dev);
311  })
312  .SetDoc(R"DOC(
313 Copy tensor from GPU to CPU context. Must be run under GPU device option.
314 )DOC")
315  .Input(0, "input", "The input tensor.")
316  .Output(0, "output", "Tensor that will contain a copy of the input.");
317 
318 OPERATOR_SCHEMA(CopyCPUToGPU)
319  .NumInputs(1)
320  .NumOutputs(1)
321  .IdenticalTypeAndShape()
322  .InputsCanCrossDevices()
323  .DeviceInferenceFunction([](const OperatorDef& def) {
324  CAFFE_ENFORCE(
325  def.has_device_option(),
326  "CopyCPUToGPU op should have cuda device option.");
327  auto& cuda_option = def.device_option();
328  auto cpu_option = DeviceOption();
329  vector<DeviceOption> in_dev(def.input_size(), cpu_option);
330  vector<DeviceOption> out_dev(def.output_size(), cuda_option);
331  return std::make_pair(in_dev, out_dev);
332  })
333  .SetDoc(R"DOC(
334 Copy tensor from CPU to GPU context. Must be run under GPU device option.
335 )DOC")
336  .Input(0, "input", "The input tensor.")
337  .Output(0, "output", "Tensor that will contain a copy of the input.");
338 
339 OPERATOR_SCHEMA(EnsureCPUOutput)
340  .NumInputs(1)
341  .NumOutputs(1)
342  .IdenticalTypeAndShape()
343  .InputsCanCrossDevices()
344  .DeviceInferenceFunction([](const OperatorDef& def) {
345  auto op_device =
346  def.has_device_option() ? def.device_option() : DeviceOption();
347  auto cpu_option = DeviceOption();
348  vector<DeviceOption> in_dev(def.input_size(), op_device);
349  vector<DeviceOption> out_dev(def.output_size(), cpu_option);
350  return std::make_pair(in_dev, out_dev);
351  })
352  .SetDoc(R"DOC(
353 Take an input tensor in the current Context (GPU or CPU) and create an output
354 which is always a TensorCPU. This may involve a cross-device MemCpy.
355 )DOC")
356  .Input(0, "input", "The input CUDA or CPU tensor.")
357  .Output(0, "output", "TensorCPU that is a copy of the input.");
358 
359 OPERATOR_SCHEMA(CopyFromCPUInput)
360  .NumInputs(1)
361  .NumOutputs(1)
362  .IdenticalTypeAndShape()
363  .InputsCanCrossDevices()
364  .DeviceInferenceFunction([](const OperatorDef& def) {
365  auto op_device =
366  def.has_device_option() ? def.device_option() : DeviceOption();
367  auto cpu_option = DeviceOption();
368  vector<DeviceOption> in_dev(def.input_size(), cpu_option);
369  vector<DeviceOption> out_dev(def.output_size(), op_device);
370  return std::make_pair(in_dev, out_dev);
371  })
372  .SetDoc(R"DOC(
373 Take a CPU input tensor and copy it to an output in the current
374 Context (GPU or CPU). This may involve a cross-device MemCpy.
375 )DOC")
376  .Input(0, "input", "The input CPU tensor.")
377  .Output(0, "output", "either a TensorCUDA or a TensorCPU");
378 
379 OPERATOR_SCHEMA(CopyOnDeviceLike)
380  .NumInputs(2)
381  .NumOutputs(1)
382  .SetDoc("Copy input tensor into output on the device given by the second input.")
383  .Input(0, "input", "The input tensor.")
384  .Input(1, "dst", "Tensor whose device determines where the copy is performed.")
385  .Output(0, "output", "Tensor that will contain a copy of the input.");
386 
387 OPERATOR_SCHEMA(HasElements)
388  .NumInputs(1)
389  .NumOutputs(1)
390  .SetDoc("Returns true iff the input tensor has size > 0")
391  .Input(0, "tensor", "Tensor of any type.")
392  .Output(
393  0,
394  "has_elements",
395  "Scalar bool tensor. True if input is not empty.");
396 
397 OPERATOR_SCHEMA(IsEmpty)
398  .NumInputs(1)
399  .NumOutputs(1)
400  .SetDoc("Returns true iff the input tensor has size == 0")
401  .ScalarType(::caffe2::TensorProto_DataType::TensorProto_DataType_BOOL)
402  .Input(0, "tensor", "Tensor of any type.")
403  .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");
404 
405 OPERATOR_SCHEMA(Gather)
406  .NumInputs(2)
407  .NumOutputs(1)
408  .SetDoc(R"DOC(
409 Given DATA tensor of rank r >= 1, and INDICES tensor of rank q, gather
410 entries of the outer-most dimension of DATA indexed by INDICES, and concatenate
411 them in an output tensor of rank q + (r - 1).
412 
413 Example:
414  DATA = [
415  [1.0, 1.2],
416  [2.3, 3.4],
417  [4.5, 5.7],
418  ]
419  INDICES = [
420  [0, 1],
421  [1, 2],
422  ]
423  OUTPUT = [
424  [
425  [1.0, 1.2],
426  [2.3, 3.4],
427  ],
428  [
429  [2.3, 3.4],
430  [4.5, 5.7],
431  ],
432  ]
433 )DOC")
434  .Input(0, "DATA", "Tensor of rank r >= 1.")
435  .Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
436  .Output(0, "OUTPUT", "Tensor of rank q + (r - 1).")
437  .TensorInferenceFunction([](const OperatorDef& def,
438  const vector<TensorShape>& in) {
439  vector<TensorShape> out(1);
440  for (auto d : in[1].dims()) {
441  out[0].add_dims(d);
442  }
443  for (int i = 1; i < in[0].dims_size(); ++i) {
444  out[0].add_dims(in[0].dims(i));
445  }
446  out[0].set_data_type(in[0].data_type());
447  return out;
448  });
449 
450 OPERATOR_SCHEMA(GatherRanges)
451  .NumInputs(2)
452  .NumOutputs(2)
453  .SetDoc(R"DOC(
454 Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather
455 corresponding ranges into a 1-D tensor OUTPUT.
456 
457 RANGES dimensions description:
458 1: represents the list of examples within a batch
459 2: represents the list of features
460 3: two values which are the start and length of a range (to be applied on DATA)
461 
462 The second output, LENGTHS, gives the length of each example within OUTPUT.
463 
464 Example:
465  DATA = [1, 2, 3, 4, 5, 6]
466  RANGES = [
467  [
468  [0, 1],
469  [2, 2],
470  ],
471  [
472  [4, 1],
473  [5, 1],
474  ]
475  ]
476  OUTPUT = [1, 3, 4, 5, 6]
477  LENGTHS = [3, 2]
478 )DOC")
479  .Input(0, "DATA", "Tensor of rank 1.")
480  .Input(
481  1,
482  "RANGES",
483  "Tensor of int32/int64 ranges, of dims (N, M, 2). "
484  "Where N is number of examples and M is a size of each example. "
485  "Last dimension represents a range in the format (start, lengths)")
486  .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
487  .Output(
488  1,
489  "LENGTHS",
490  "1-D tensor of size N with lengths over gathered data"
491  " for each row in a batch. sum(LENGTHS) == OUTPUT.size()")
492  .TensorInferenceFunction([](const OperatorDef& /* unused */,
493  const vector<TensorShape>& in) {
494  std::vector<TensorShape> out(2);
495 
496  int total = 1;
497  for (auto d : in[0].dims()) {
498  total *= d;
499  }
500  out[0].add_dims(total);
501  out[0].set_data_type(in[0].data_type());
502  out[1].add_dims(in[1].dims(0));
503  out[1].set_data_type(in[1].data_type());
504  return out;
505  });
506 
507 OPERATOR_SCHEMA(LengthsGather)
508  .NumInputs(3)
509  .NumOutputs(1)
510  .SetDoc(R"DOC(
511 Gather items from a sparse tensor. The sparse tensor is described by items and
512 lengths. This operator gathers items corresponding to lengths at the given
513 indices. This deliberately doesn't return lengths of OUTPUT so that both lists
514 and maps can be supported without special cases. If you need a lengths tensor for
515 OUTPUT, use `Gather`.
516 
517 Example:
518  ITEMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
519  LENGTHS = [0, 2, 3, 1, 4]
520  INDICES = [0, 2, 4]
521 
522  OUTPUT = [2, 3, 4, 6, 7, 8, 9]
523 )DOC")
524  .Input(0, "ITEMS", "items tensor")
525  .Input(1, "LENGTHS", "lengths tensor")
526  .Input(2, "INDICES", "indices into LENGTHS where items should be gathered")
527  .Output(0, "OUTPUT", "1-D tensor containing gathered items");
528 
529 OPERATOR_SCHEMA(Unique)
530  .NumInputs(1)
531  .NumOutputs(1, 2)
532  .SetDoc(R"DOC(
533 Deduplicates input indices vector and optionally produces reverse remapping.
534 There are no guarantees on the ordering of the output indices.
535 )DOC")
536  .Input(0, "indices", "1D tensor of int32 or int64 indices.")
537  .Output(0, "unique_indices", "1D tensor of deduped entries.")
538  .Output(
539  1,
540  "remapping",
541  "(optional) mapping from `indices` to `unique_indices`. This has the "
542  "same shape as `indices`. Its elements are the indices into "
543  "`unique_indices` such that `Gather(['unique_indices', 'remapping'])` "
544  "yields `indices`.")
545  .TensorInferenceFunction([](const OperatorDef& def,
546  const vector<TensorShape>& in) {
547  std::vector<TensorShape> out(1);
548  out[0].set_data_type(in[0].data_type());
549  CAFFE_ENFORCE_EQ(in[0].dims_size(), 1);
550  if (in[0].dims(0) <= 1) {
551  // This special case is useful in some situation, e.g., when feeding
552  // tensor inference with empty tensor (where the first dim is the batch
553  // size)
554  out[0].add_dims(in[0].dims(0));
555  } else {
556  out[0].set_unknown_shape(true);
557  }
558  if (def.output_size() > 1) {
559  // Remapping has the same shape as the input tensor
560  out.push_back(in[0]);
561  out.back().set_data_type(TensorProto::INT32);
562  }
563  return out;
564  });
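// Example (the CPU implementation above sorts, so unique_indices come out in
// ascending order, though the schema does not guarantee any ordering):
//   indices        = [3, 1, 3, 2]
//   unique_indices = [1, 2, 3]
//   remapping      = [2, 0, 2, 1]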
565 
566 OPERATOR_SCHEMA(LengthsToSegmentIds)
567  .NumInputs(1)
568  .NumOutputs(1)
569  .SetDoc(R"DOC(
570 Given a vector of segment lengths, returns a zero-based, consecutive vector
571 of segment_ids. For example, [1, 3, 0, 2] will produce [0, 1, 1, 1, 3, 3].
572 In general, the inverse operation is SegmentIdsToLengths. Notice though that
573 trailing empty sequence lengths can't be properly recovered from segment ids.
574 )DOC")
575  .Input(0, "lengths", "1D tensor of int32 or int64 segment lengths.")
576  .Output(0, "segment_ids", "1D tensor of length `sum(lengths)`");
577 
578 OPERATOR_SCHEMA(LengthsToRanges)
579  .NumInputs(1)
580  .NumOutputs(1)
581  .SetDoc(R"DOC(
582 Given a vector of segment lengths, calculates offsets of each segment and packs
583 them next to the lengths. For an input vector of length N the output is an N x 2
584 matrix with an (offset, length) pair packed for each segment.
585 
586 For example, `[1, 3, 0, 2]` transforms into `[[0, 1], [1, 3], [4, 0], [4, 2]]`.
587 )DOC")
588  .Input(0, "lengths", "1D tensor of int32 segment lengths.")
589  .Output(
590  0,
591  "ranges",
592  "2D tensor of shape len(lengths) X 2 and the same type as `lengths`");
593 
594 OPERATOR_SCHEMA(SegmentIdsToLengths)
595  .NumInputs(1, 2)
596  .NumOutputs(1)
597  .SetDoc(R"DOC(
598 Transforms a vector of segment ids into a vector of segment lengths. This operation
599 supports non-consecutive segment ids. Segments not appearing in the input vector
600 will have length 0. If the second input is provided, the number of segments =
601 the size of its first dimension. Otherwise, the number of segments = the last
602 index in the first input vector + 1.
603 
604 In general, for consecutive, zero-based segment IDs, this is the inverse
605 operation of LengthsToSegmentIds, except that a vector of segment IDs
606 cannot represent empty segments at the end (if the second input is absent).
607 )DOC")
608  .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
609  .Input(
610  1,
611  "data (optional)",
612  "if provided, number of segments = the size of its first dimension")
613  .Output(0, "lengths", "1-D int64_t tensor of segment lengths");
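// Example (a sketch, without the optional second input):
//   segment_ids = [0, 0, 1, 1, 1, 3]
//   number of segments = last id + 1 = 4
//   lengths     = [2, 3, 0, 1]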
614 
615 OPERATOR_SCHEMA(SegmentIdsToRanges)
616  .NumInputs(1, 2)
617  .NumOutputs(1)
618  .SetDoc(R"DOC(
619 Transforms a vector of segment ids into a vector of segment ranges. This operation
620 supports non-consecutive segment ids. Segments not appearing in the input vector
621 will have length 0. If the second input is provided, the number of segments =
622 the size of its first dimension. Otherwise, the number of segments = the last
623 index in the first input vector + 1.
624 )DOC")
625  .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
626  .Input(
627  1,
628  "data (optional)",
629  "if provided, number of segments = the size of its first dimension")
630  .Output(0, "ranges", "2-D tensor of segment ranges, one (start, length) row per segment");
631 
632 OPERATOR_SCHEMA(LengthsToWeights)
633  .NumInputs(1)
634  .NumOutputs(1)
635  .Arg("power", "n of 1/pow(length,n) for normalization")
636  .SetDoc(R"DOC(
637 Similar to LengthsToSegmentIds, but outputs a vector of segment weights
638 derived from the lengths, i.e. 1/pow(length, power).
639 )DOC")
640  .Input(0, "lengths", "1-D int32_t or int64_t tensor of lengths")
641  .Output(0, "a vector of weights", "1-D float tensor of weights by length");
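// Example (a sketch, assuming the output has one weight per element, i.e.
// sum(lengths) entries, and power = 1):
//   lengths = [2, 3]
//   weights = [1/2, 1/2, 1/3, 1/3, 1/3]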
642 
643 
644 
645 SHOULD_NOT_DO_GRADIENT(WallClockTime);
646 
647 OPERATOR_SCHEMA(UnsafeCoalesce)
648  .NumInputsOutputs([](int inputs, int outputs) {
649  return inputs + 1 == outputs;
650  })
651  .AllowInplace([](int input, int output) { return input == output; })
652  .SetDoc(R"DOC(
653 Coalesce the N inputs into N outputs and a single coalesced output blob.
654 
655 This allows operations that operate over multiple small kernels (e.g.
656 biases in a deep CNN) to be coalesced into a single larger operation,
657 amortizing the kernel launch overhead, synchronization costs for
658 distributed computation, etc.
659 
660 The operator:
661 
662 - computes the total size of the coalesced blob by summing the input sizes
663 - allocates the coalesced output blob as the total size
664 - copies the input vectors into the coalesced blob, at the correct offset.
665 - aliases each Output(i) to point into the coalesced blob, at the corresponding offset for Input(i).
666 
667 This is 'unsafe' as the output vectors are aliased, so use with
668 caution.
669 
670 )DOC");
671 
672 OPERATOR_SCHEMA(EnsureDense)
673  .NumInputs(1)
674  .NumOutputs(1)
675  .AllowInplace({{0, 0}})
676  .IdenticalTypeAndShape()
677  .SetDoc(R"DOC(
678 This operator converts dense or sparse gradients to dense ones.
679 Therefore, sparse gradients can be back-propagated to operators that consume
680 only dense gradients (e.g., FCGradient).
681 
682 The operator's behaviors:
683 
684 - In the forward pass, simply pass the input through (in place) or copy it to the output.
685 - In the backward pass, if the incoming gradient is sparse, convert it to a dense gradient in linear time; otherwise, simply pass the dense gradient through.
686 )DOC")
687  .Input(0, "input", "Input tensors.")
688  .Output(0, "output", "Output tensor. Same dimension as inputs.");
689 
690 OPERATOR_SCHEMA(AccumulateHistogram)
691  .NumInputs(1)
692  .NumOutputs(2)
693  .SetDoc(R"DOC(
694 This operator calculates the histogram of values in the input tensor.
695 There are 2 outputs: one for the histogram of the current input tensor, and
696 another for the histogram of all input tensors accumulated through history.
697 Each output contains num_buckets + 2 values: indices [1 ... num_buckets]
698 count values in the [lower_bound, upper_bound) interval, and the remaining 2
699 count values smaller than lower_bound or greater than upper_bound, respectively.
700 )DOC")
701  .Input(0, "X", "Input tensor.")
702  .Output(0, "CurHist", "Output histogram of the current tensor.")
703  .Output(1, "AccHist", "Accumulated histogram of the history tensor.")
704  .Arg("lower_bound", "the lower bound value")
705  .Arg("upper_bound", "the upper bound value")
706  .Arg(
707  "num_buckets",
708  "number of buckets to use in [lower_bound, upper_bound)");
709 
710 class GetEnsureDenseGradient : public GradientMakerBase {
711  using GradientMakerBase::GradientMakerBase;
712  vector<OperatorDef> GetGradientDefs() override {
713  CAFFE_ENFORCE(
714  GradOut(0).IsSparse() || GradOut(0).IsDense(),
715  "Input gradient ",
716  O(0),
717  " should be either sparse or dense.");
718 
719  if (GradOut(0).IsDense()) {
720  SetDense(0, GO(0));
721  return vector<OperatorDef>();
722  } else {
723  return SingleGradientDef(
724  "SparseToDense",
725  "",
726  vector<string>{GO_I(0), GO_V(0), I(0)},
727  vector<string>{GI(0)});
728  }
729  }
730 };
731 REGISTER_GRADIENT(EnsureDense, GetEnsureDenseGradient);
732 
733 SHOULD_NOT_DO_GRADIENT(Print);
734 SHOULD_NOT_DO_GRADIENT(HasElements);
735 SHOULD_NOT_DO_GRADIENT(IsEmpty);
736 SHOULD_NOT_DO_GRADIENT(LengthsToShape);
737 SHOULD_NOT_DO_GRADIENT(UnsafeCoalesce);
738 
739 class GetAliasGradient : public GradientMakerBase {
740  using GradientMakerBase::GradientMakerBase;
741  vector<OperatorDef> GetGradientDefs() override {
742  // We will simply pass-along the gradient. Nothing needs to
743  // be calculated.
744  SetDense(0, GO(0));
745  return vector<OperatorDef>();
746  }
747 };
748 REGISTER_GRADIENT(Alias, GetAliasGradient);
749 
750 SHOULD_NOT_DO_GRADIENT(ResizeLike);
751 
752 class GetSumGradient : public GradientMakerBase {
753  using GradientMakerBase::GradientMakerBase;
754  vector<OperatorDef> GetGradientDefs() override {
755  for (auto i = 0; i < def_.input_size(); ++i) {
756  SetDense(i, GO(0));
757  }
758  return vector<OperatorDef>();
759  }
760 };
761 REGISTER_GRADIENT(Sum, GetSumGradient);
762 
763 SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
764 SHOULD_NOT_DO_GRADIENT(ScatterAssign);
765 
766 class GetWeightedSumGradient : public GradientMakerBase {
767  using GradientMakerBase::GradientMakerBase;
768  vector<OperatorDef> GetGradientDefs() override {
769  ArgumentHelper argsHelper(def_);
770  const bool grad_on_w = argsHelper.GetSingleArgument<bool>("grad_on_w", 0);
771 
772  auto inputs = vector<string>{GO(0)};
773  auto outputs = vector<string>();
774  for (int i = 0; i < def_.input_size(); i += 2) {
775  inputs.push_back(I(i));
776  inputs.push_back(I(i + 1));
777  outputs.push_back(GI(i));
778  }
779 
780  if (grad_on_w) {
781  for (int i = 0; i < def_.input_size(); i += 2) {
782  outputs.push_back(GI(i + 1));
783  }
784  }
785 
786  return SingleGradientDef("WeightedSumGradient", "", inputs, outputs);
787  }
788 };
789 REGISTER_GRADIENT(WeightedSum, GetWeightedSumGradient);
790 
791 class GetGatherGradient : public GradientMakerBase {
792  using GradientMakerBase::GradientMakerBase;
793  vector<OperatorDef> GetGradientDefs() override {
794  ArgumentHelper argsHelper(def_);
795  const bool dense_gradient =
796  argsHelper.GetSingleArgument<bool>("dense_gradient", false);
797 
798  using Op = GatherOp<CPUContext>;
799 
800  if (dense_gradient) {
801  return vector<OperatorDef>{CreateOperatorDef(
802  "SparseToDense",
803  "",
804  vector<string>{I(Op::INDICES), GO(0), I(Op::DATA)},
805  vector<string>{GI(Op::DATA)})};
806  } else {
807  // For now we don't do any reshaping, as the consumer of this op would
808  // probably be ScatterUpdate, which intentionally ignores shapes. We
809  // might need to revisit this in the future for correctness purposes. The
810  // right shape for the output would be to flatten INDICES and collapse the
811  // first X dims of GRAD.
812  SetSparse(Op::DATA, I(Op::INDICES), GO(0));
813  return vector<OperatorDef>();
814  }
815  }
816 };
817 REGISTER_GRADIENT(Gather, GetGatherGradient);
818 
819 struct GetFlattenToVecGradient : public GradientMakerBase {
820  using GradientMakerBase::GradientMakerBase;
821  vector<OperatorDef> GetGradientDefs() override {
822  return SingleGradientDef(
823  "ResizeLike", "", vector<string>{GO(0), I(0)}, vector<string>{GI(0)});
824  }
825 };
826 REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient);
827 
828 struct GetCopyGradient : public GradientMakerBase {
829  using GradientMakerBase::GradientMakerBase;
830  vector<OperatorDef> GetGradientDefs() override {
831  return SingleGradientDef(
832  "CopyOnDeviceLike",
833  "",
834  vector<string>{GO(0), I(0)},
835  vector<string>{GI(0)});
836  }
837 };
838 REGISTER_GRADIENT(Copy, GetCopyGradient);
839 
840 struct GetGPUToCPUGradient : public GradientMakerBase {
841  using GradientMakerBase::GradientMakerBase;
842  vector<OperatorDef> GetGradientDefs() override {
843  if (g_output_[0].IsDense()) {
844  return SingleGradientDef(
845  "CopyCPUToGPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
846  } else {
847  return vector<OperatorDef>{CreateOperatorDef(
848  "CopyCPUToGPU",
849  "",
850  std::vector<string>{GO_I(0)},
851  std::vector<string>{GI_I(0)}),
852  CreateOperatorDef(
853  "CopyCPUToGPU",
854  "",
855  std::vector<string>{GO_V(0)},
856  std::vector<string>{GI_V(0)})};
857  }
858  }
859 };
860 REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient);
861 
862 struct GetCPUToGPUGradient : public GradientMakerBase {
863  using GradientMakerBase::GradientMakerBase;
864  vector<OperatorDef> GetGradientDefs() override {
865  if (g_output_[0].IsDense()) {
866  return SingleGradientDef(
867  "CopyGPUToCPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
868  } else {
869  return vector<OperatorDef>{CreateOperatorDef(
870  "CopyGPUToCPU",
871  "",
872  std::vector<string>{GO_I(0)},
873  std::vector<string>{GI_I(0)}),
874  CreateOperatorDef(
875  "CopyGPUToCPU",
876  "",
877  std::vector<string>{GO_V(0)},
878  std::vector<string>{GI_V(0)})};
879  }
880  }
881 };
882 REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);
883 
884 SHOULD_NOT_DO_GRADIENT(Unique);
885 SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
886 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
887 SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges);
888 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
889 SHOULD_NOT_DO_GRADIENT(GatherRangesOp);
890 SHOULD_NOT_DO_GRADIENT(LengthsGather);
891 SHOULD_NOT_DO_GRADIENT(AccumulateHistogram);
892 
893 template <>
894 bool NanCheckOp<CPUContext>::RunOnDevice() {
895  auto& X = Input(0);
896  auto* Y = Output(0);
897  const int D = X.size();
898  const float* data = X.data<float>();
899  ConstEigenVectorMap<float> input_data(data, D);
900 
901  bool all_finite = input_data.allFinite();
902 
903  if (!all_finite) {
904  std::cerr << "Tensor contained NaN or inf: [" << this->debug_def().input(0)
905  << "]" << std::endl;
906 
907  for (int j = 0; j < InputSize(); j++) {
908  std::cerr << "Tensor name: " << this->debug_def().input(j) << std::endl;
909  std::cerr << "Input tensor:" << std::endl;
910  tensorPrinter_.Print<float>(Input(j));
911  std::cerr << "NaN idxs:" << std::endl;
912  const float* x = Input(j).data<float>();
913  for (size_t i = 0; i < Input(j).size(); ++i) {
914  if (std::isnan(x[i]) || std::isinf(x[i])) {
915  std::cerr << i << " ";
916  }
917  }
918  std::cerr << std::endl;
919  }
920  return false;
921  }
922 
923  if (&X != Y) {
924  Y->CopyFrom(X, &context_);
925  }
926  return true;
927 }
928 REGISTER_CPU_OPERATOR(NanCheck, NanCheckOp<CPUContext>);
929 REGISTER_GRADIENT(NanCheck, GetNanCheckGradient);
930 
931 OPERATOR_SCHEMA(NanCheck)
932  .NumInputs(1, INT_MAX)
933  .NumOutputs(1)
934  .AllowInplace({{0, 0}})
935  .IdenticalTypeAndShapeOfInput(0)
936  .SetDoc("Identity operator, but checks all values for nan or inf")
937  .Input(0, "tensor", "Tensor to check for nan/inf")
938  .Output(
939  0,
940  "output",
941  "Tensor to copy input into if no NaNs or inf."
942  " Can be in-place");
943 
944 OPERATOR_SCHEMA(Size)
945  .NumInputs(1)
946  .NumOutputs(1)
947  .SetDoc(
948  "Return a 1D tensor of type int64 that contains the number "
949  "of elements of the input tensor")
950  .Input(0, "tensor", "Tensor to calculate number of elements")
951  .Output(
952  0,
953  "output",
954  "1D tensor of type int64 that contains the number of "
955  "elements in the input tensor.");
956 
957 REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>);
958 NO_GRADIENT(Size);
959 
960 template <>
961 template <typename T>
962 bool RangeOp<CPUContext>::DoRunOnDevice(
963  const T& start,
964  const T& step,
965  Tensor<CPUContext>* output) {
966  auto* output_data = output->template mutable_data<T>();
967  for (int i = 0; i < output->size(); ++i) {
968  output_data[i] = i * step + start;
969  }
970  return true;
971 }
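// Worked example of the loop above (a sketch, assuming the output was already
// sized elsewhere to the number of elements in [start, stop) with the given step):
//   start = 2, step = 3, output->size() = 3
//   output = [2, 5, 8]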
972 
973 OPERATOR_SCHEMA(Range)
974  .NumInputs(1, 3)
975  .NumOutputs(1)
976  .SetDoc(
977  "Values are generated within the half-open interval [start, stop) "
978  "(in other words, the interval including start but excluding stop). "
979  "When called with a single value, this will return `[0, v)` with the "
980  "result type inferred from the input types.")
981  .Input(
982  0,
983  "start",
984  "Optional scalar Tensor with the start of the interval (inclusive).")
985  .Input(1, "stop", "scalar Tensor with the end of the interval (exclusive)")
986  .Input(2, "step", "Optional scalar Tensor with spacing between values.")
987  .Output(
988  0,
989  "output",
990  "1D tensor of same type as inputs that contains the sequence.");
991 
992 REGISTER_CPU_OPERATOR(Range, RangeOp<CPUContext>);
993 NO_GRADIENT(Range);
994 
995 } // namespace caffe2