#include "caffe2/operators/utility_ops.h"

namespace caffe2 {

template <>
bool WeightedSumOp<CPUContext>::RunOnDevice() {
  return DoRunWithType<float>();
}

template <>
bool WeightedSumGradientOp<CPUContext>::RunOnDevice() {
  return DoRunWithType<float>();
}
template <>
template <typename T>
void UniqueOp<CPUContext>::DoRun() {
  auto& inputTensor = Input(0);
  // Use dim32 to enforce that the remapping fits in an int.
  int N = inputTensor.dim32(0);
  CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector");
  auto* uniqueTensor = Output(UNIQUE);

  int* remapping = nullptr;
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor = Output(REMAPPING);
    remappingTensor->ResizeLike(inputTensor);
    remapping = remappingTensor->template mutable_data<int>();
  }

  const T* input = inputTensor.template data<T>();
  // Sort the indices of the input so that duplicate values become adjacent.
  order_.resize(N);
  std::iota(order_.begin(), order_.end(), 0);
  std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
    return input[x] < input[y];
  });
  // Count the number of unique elements.
  int K = N;
  for (int i = 1; i < N; ++i) {
    K -= input[order_[i]] == input[order_[i - 1]];
  }
  uniqueTensor->Resize(K);
  T* unique = uniqueTensor->template mutable_data<T>();
  // Write out the unique values and, if requested, the remapping from each
  // input position to its position in the unique output.
  K = 0;
  T prev = -1;
  for (int i = 0; i < N; ++i) {
    if (i == 0 || prev != input[order_[i]]) {
      prev = unique[K++] = input[order_[i]];
    }
    if (remapping) {
      remapping[order_[i]] = K - 1;
    }
  }
}
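// Illustrative example (assuming the sort-based implementation above): for
// input = [2, 7, 2, 0] the outputs would be UNIQUE = [0, 2, 7] and, when the
// optional second output is present, REMAPPING = [1, 2, 1, 0], so that
// UNIQUE[REMAPPING[i]] == input[i] for every position i.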
REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);
REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    ScatterWeightedSum,
    ScatterWeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    EnsureCPUOutput,
    CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(
    CopyFromCPUInput,
    CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(
    CopyOnDeviceLike,
    CopyOnDeviceLikeOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp<CPUContext>);
REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsGather, LengthsGatherOp<CPUContext>);
REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
REGISTER_CPU_OPERATOR(SegmentIdsToRanges, SegmentIdsToRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToWeights, LengthsToWeightsOp<CPUContext>);
REGISTER_CPU_OPERATOR(EnsureDense, EnsureDenseOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    AccumulateHistogram,
    AccumulateHistogramOp<float, CPUContext>);
OPERATOR_SCHEMA(WallClockTime)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc("Time since epoch in nanoseconds.")
    .Output(0, "time", "The time in nanoseconds.");
REGISTER_CPU_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CPUContext>);
OPERATOR_SCHEMA(Print)
    .NumInputs(1)
    .NumOutputs(0)
    .SetDoc("Logs shape and contents of input tensor to stderr or to a file.")
    .Arg(
        "to_file",
        "(bool) if 1, saves contents to the root folder of the current "
        "workspace, appending the tensor contents to a file named after "
        "the blob name. Otherwise, logs to stderr.")
    .Input(0, "tensor", "The tensor to print.");
OPERATOR_SCHEMA(LengthsToShape).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(FlattenToVec)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      // The output is 1-D with as many elements as the input has in total.
      int total = 1;
      for (auto d : in[0].dims()) {
        total *= d;
      }
      out[0].set_data_type(in[0].data_type());
      out[0].add_dims(total);
      return out;
    })
    .SetDoc(R"DOC(
Flattens the input tensor into a 1D vector.
)DOC")
    .Input(0, "input", "A tensor of rank >= 1.")
    .Output(
        0,
        "output",
        "A tensor of rank 1 with the contents of the input tensor");
OPERATOR_SCHEMA(Alias)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(
Makes the output and the input share the same underlying storage.

WARNING: in general, in caffe2's operator interface different tensors should
have different underlying storage, which is the assumption made by
components such as the dependency engine and memory optimization. Thus, in
normal situations you should not use the AliasOp, especially in a normal
forward-backward pass.

The Alias op is provided so one can achieve true asynchrony, such as
Hogwild, in a graph. But make sure you understand all the implications
similar to multi-thread computation before you use it explicitly.
)DOC")
    .Input(0, "input", "Input tensor whose storage will be shared.")
    .Output(0, "output", "Tensor of same shape as input, sharing its storage.");
OPERATOR_SCHEMA(ResizeLike)
    .NumInputs(2)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      // Shape comes from the second input, data type from the first.
      out[0] = in[1];
      out[0].set_data_type(in[0].data_type());
      return out;
    })
    .SetDoc(R"DOC(
Produces tensor containing data of first input and shape of second input.
)DOC")
    .Input(0, "data", "Tensor whose data will be copied into the output.")
    .Input(1, "shape_tensor", "Tensor whose shape will be applied to output.")
    .Output(0, "output", "Tensor with data of input 0 and shape of input 1.");
OPERATOR_SCHEMA(SumInt)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .InputsCanCrossDevices()
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      // Same shape as the first input, but always int32.
      out[0] = in[0];
      out[0].set_data_type(TensorProto::INT32);
      return out;
    })
    .AllowInplace({{0, 0}});
OPERATOR_SCHEMA(WeightedSum)
    .NumInputs([](int n) { return (n > 0 && n % 2 == 0); })
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(
Element-wise weighted sum of several data, weight tensor pairs.
Input should be in the form X_0, weight_0, X_1, weight_1, ... where X_i all
have the same shape, and weight_i are size-1 tensors that specify the weight
of each vector. Note that if one wants to do in-place computation, it could
only be done with X_0 also as the output, but not other X_i.
)DOC")
    .Input(0, "data_0", "First of the input tensors.")
    .Input(1, "weight_0", "Weight of the first input in the sum.")
    .Output(0, "output", "Result containing weighted elem-wise sum of inputs.");
OPERATOR_SCHEMA(WeightedSumGradient)
    .NumInputs([](int n) { return (n > 0 && n % 2 == 1); })
    .NumOutputs(1, INT_MAX);
OPERATOR_SCHEMA(ScatterWeightedSum)
    .NumInputs([](int n) { return (n > 3 && (n - 3) % 2 == 0); })
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Similar to WeightedSum, computes the weighted sum of several tensors, with
the difference that inputs are sliced tensors. The first tensor has to be
in-place and only slices of it on the first dimension as indexed by INDICES
will be updated.

Note: The op pretty much ignores the exact shapes of the input arguments and
cares only about sizes. It's done for performance consideration to avoid
unnecessary reshapes. Only the first dimension of X_0 is important, let's
call it N. If M is the total size of X_0 and K is the size of INDICES then
X_i is assumed to be of shape K x (M / N) regardless of the real shape.

Note: Each update in INDICES is applied independently, which means that if
duplicated elements are present in INDICES the corresponding slice of X_0
will be scaled multiple times. Manual collapsing of INDICES is required
beforehand if necessary.

Note: Updates are applied sequentially by inputs, which might have undesired
consequences if the input tensor is accessed concurrently by different ops
(e.g. when doing Hogwild). Other threads might see intermediate results even
on the individual slice level, e.g. X_0 scaled by weight_0 but without any
updates applied.

Currently only works on CPU because of access to INDICES.
)DOC")
    .Input(0, "X_0", "Tensor to be updated.")
    .Input(
        1,
        "Weight_0",
        "Scalar weight for X_0, applied only to slices affected.")
    .Input(
        2,
        "INDICES",
        "1-D list of indices on the first dimension of X_0 "
        "that need to be updated")
    .Input(3, "X_1", "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
    .Input(4, "Weight_1", "Scalar weight for X_1 update")
    .Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
    .EnforceInplace({{0, 0}});
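// Illustrative update rule (assumed from the doc above): for each position j
// in INDICES, with update pairs (X_1, Weight_1), (X_2, Weight_2), ...,
//   X_0[INDICES[j]] = Weight_0 * X_0[INDICES[j]]
//                     + Weight_1 * X_1[j] + Weight_2 * X_2[j] + ...
// applied slice-by-slice on the first dimension of X_0.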
OPERATOR_SCHEMA(ScatterAssign)
    .NumInputs(3)
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Update slices of the tensor in-place by overriding the current value.

Note: The op pretty much ignores the exact shapes of the input arguments and
cares only about sizes. It's done for performance consideration to avoid
unnecessary reshapes. Only the first dimension of X_0 is important, let's
call it N. If M is the total size of X_0 and K is the size of INDICES then
X_i is assumed to be of shape K x (M / N) regardless of the real shape.

Note: Each update in INDICES is applied independently, which means that if
duplicated elements are present in INDICES an arbitrary one will win.

Currently only works on CPU because of access to INDICES.
)DOC")
    .Input(0, "DATA", "Tensor to be updated.")
    .Input(
        1,
        "INDICES",
        "1-D list of indices on the first dimension "
        "of X_0 that need to be updated")
    .Input(
        2,
        "SLICES",
        "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
    .Output(0, "DATA", "Has to be exactly the same tensor as the input 0");
OPERATOR_SCHEMA(Copy)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .SetDoc("Copy input tensor into output, potentially across devices.")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");
OPERATOR_SCHEMA(CopyGPUToCPU)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      CAFFE_ENFORCE(
          def.has_device_option(),
          "CopyGPUToCPU op should have cuda device option.");
      auto& cuda_option = def.device_option();
      auto cpu_option = DeviceOption();
      // Inputs live on the op's (CUDA) device; outputs land on CPU.
      vector<DeviceOption> in_dev(def.input_size(), cuda_option);
      vector<DeviceOption> out_dev(def.output_size(), cpu_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Copy tensor for GPU to CPU context. Must be run under GPU device option.
)DOC")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");
OPERATOR_SCHEMA(CopyCPUToGPU)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      CAFFE_ENFORCE(
          def.has_device_option(),
          "CopyCPUToGPU op should have cuda device option.");
      auto& cuda_option = def.device_option();
      auto cpu_option = DeviceOption();
      // Inputs come from CPU; outputs land on the op's (CUDA) device.
      vector<DeviceOption> in_dev(def.input_size(), cpu_option);
      vector<DeviceOption> out_dev(def.output_size(), cuda_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Copy tensor for CPU to GPU context. Must be run under GPU device option.
)DOC")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");
OPERATOR_SCHEMA(EnsureCPUOutput)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      auto op_device =
          def.has_device_option() ? def.device_option() : DeviceOption();
      auto cpu_option = DeviceOption();
      // Inputs follow the op's device; outputs are always CPU.
      vector<DeviceOption> in_dev(def.input_size(), op_device);
      vector<DeviceOption> out_dev(def.output_size(), cpu_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Take an input tensor in the current Context (GPU or CPU) and create an output
which is always a TensorCPU. This may involve a cross-device MemCpy.
)DOC")
    .Input(0, "input", "The input CUDA or CPU tensor.")
    .Output(0, "output", "TensorCPU that is a copy of the input.");
OPERATOR_SCHEMA(CopyFromCPUInput)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      auto op_device =
          def.has_device_option() ? def.device_option() : DeviceOption();
      auto cpu_option = DeviceOption();
      // Inputs are always CPU; outputs follow the op's device.
      vector<DeviceOption> in_dev(def.input_size(), cpu_option);
      vector<DeviceOption> out_dev(def.output_size(), op_device);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Take a CPU input tensor and copy it to an output in the current
Context (GPU or CPU). This may involve a cross-device MemCpy.
)DOC")
    .Input(0, "input", "The input CPU tensor.")
    .Output(0, "output", "either a TensorCUDA or a TensorCPU");
OPERATOR_SCHEMA(CopyOnDeviceLike)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc("Copy input tensor into output to the specific device.")
    .Input(0, "input", "The input tensor.")
    .Input(1, "dst", "Tensor, on which device the copy will be performed.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");
OPERATOR_SCHEMA(HasElements)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc("Returns true iff the input tensor has size > 0")
    .Input(0, "tensor", "Tensor of any type.")
    .Output(
        0,
        "has_elements",
        "Scalar bool tensor. True if input is not empty.");
OPERATOR_SCHEMA(IsEmpty)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc("Returns true iff the input tensor has size == 0")
    .ScalarType(::caffe2::TensorProto_DataType::TensorProto_DataType_BOOL)
    .Input(0, "tensor", "Tensor of any type.")
    .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");
OPERATOR_SCHEMA(Gather)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given DATA tensor of rank r >= 1, and INDICES tensor of rank q, gather
entries of the outer-most dimension of DATA indexed by INDICES, and concatenate
them in an output tensor of rank q + (r - 1).
)DOC")
    .Input(0, "DATA", "Tensor of rank r >= 1.")
    .Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
    .Output(0, "OUTPUT", "Tensor of rank q + (r - 1).")
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      // Output shape is shape(INDICES) + shape(DATA)[1:].
      for (auto d : in[1].dims()) {
        out[0].add_dims(d);
      }
      for (int i = 1; i < in[0].dims_size(); ++i) {
        out[0].add_dims(in[0].dims(i));
      }
      out[0].set_data_type(in[0].data_type());
      return out;
    });
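// Illustrative example: with
//   DATA    = [[1.0, 1.2], [2.3, 3.4], [4.5, 5.7]]
//   INDICES = [0, 2]
// Gather produces
//   OUTPUT  = [[1.0, 1.2], [4.5, 5.7]]
// i.e. a tensor of rank q + (r - 1) = 1 + (2 - 1) = 2.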
OPERATOR_SCHEMA(GatherRanges)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather
corresponding ranges into a 1-D tensor OUTPUT.

RANGES dimensions description:
1: represents list of examples within a batch
2: represents list of features
3: two values which are the start and length of a range (to be applied on DATA)

Another output LENGTHS represents each example length within OUTPUT

Example:
  DATA  = [1, 2, 3, 4, 5, 6]
  RANGES = [
    [
      [0, 1],
      [2, 2],
    ],
    [
      [4, 1],
      [5, 1],
    ]
  ]
  OUTPUT = [1, 3, 4, 5, 6]
  LENGTHS = [3, 2]
)DOC")
    .Input(0, "DATA", "Tensor of rank 1.")
    .Input(
        1,
        "RANGES",
        "Tensor of int32/int64 ranges, of dims (N, M, 2). "
        "Where N is number of examples and M is a size of each example. "
        "Last dimension represents a range in the format (start, lengths)")
    .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
    .Output(
        1,
        "LENGTHS",
        "1-D tensor of size N with lengths over gathered data"
        " for each row in a batch. sum(LENGTHS) == OUTPUT.size()")
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      std::vector<TensorShape> out(2);
      // OUTPUT is at most the total size of DATA; the exact size depends on
      // the ranges, so the total is used as the inferred dimension.
      int total = 1;
      for (auto d : in[0].dims()) {
        total *= d;
      }
      out[0].add_dims(total);
      out[0].set_data_type(in[0].data_type());
      // LENGTHS has one entry per example (first dimension of RANGES).
      out[1].add_dims(in[1].dims(0));
      out[1].set_data_type(in[1].data_type());
      return out;
    });
OPERATOR_SCHEMA(LengthsGather)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Gather items from sparse tensor. Sparse tensor is described by items and
lengths. This operator gathers items corresponding to lengths at the given
indices. This deliberately doesn't return lengths of OUTPUT so that both lists
and maps can be supported without special cases. If you need lengths tensor for
OUTPUT, use `Gather`.

Example:
  ITEMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
  LENGTHS = [0, 2, 3, 1, 4]
  INDICES = [0, 2, 4]

  OUTPUT = [2, 3, 4, 6, 7, 8, 9]
)DOC")
    .Input(0, "ITEMS", "items tensor")
    .Input(1, "LENGTHS", "lengths tensor")
    .Input(2, "INDICES", "indices into LENGTHS where items should be gathered")
    .Output(0, "OUTPUT", "1-D tensor containing gathered items");
OPERATOR_SCHEMA(Unique)
    .NumInputs(1)
    .NumOutputs(1, 2)
    .SetDoc(R"DOC(
Deduplicates input indices vector and optionally produces reverse remapping.
There are no guarantees on the ordering of the output indices.
)DOC")
    .Input(0, "indices", "1D tensor of int32 or int64 indices.")
    .Output(0, "unique_indices", "1D tensor of deduped entries.")
    .Output(
        1,
        "remapping",
        "(optional) mapping from `indices` to `unique_indices`. This has the "
        "same shape as `indices`. Its elements are the indices into "
        "`unique_indices` such that `Gather(['unique_indices', 'remapping'])` "
        "yields `indices`.")
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      std::vector<TensorShape> out(1);
      out[0].set_data_type(in[0].data_type());
      CAFFE_ENFORCE_EQ(in[0].dims_size(), 1);
      if (in[0].dims(0) <= 1) {
        // Zero or one element cannot contain duplicates.
        out[0].add_dims(in[0].dims(0));
      } else {
        // The number of unique entries is data-dependent.
        out[0].set_unknown_shape(true);
      }
      if (def.output_size() > 1) {
        // Remapping has the same shape as the input tensor.
        out.push_back(in[0]);
        out.back().set_data_type(TensorProto::INT32);
      }
      return out;
    });
OPERATOR_SCHEMA(LengthsToSegmentIds)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a vector of segment lengths, returns a zero-based, consecutive vector
of segment_ids. For example, [1, 3, 0, 2] will produce [0, 1, 1, 1, 3, 3].
In general, the inverse operation is SegmentIdsToLengths. Notice though that
trailing empty sequence lengths can't be properly recovered from segment ids.
)DOC")
    .Input(0, "lengths", "1D tensor of int32 or int64 segment lengths.")
    .Output(0, "segment_ids", "1D tensor of length `sum(lengths)`");
OPERATOR_SCHEMA(LengthsToRanges)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a vector of segment lengths, calculates offsets of each segment and packs
them next to the lengths. For the input vector of length N the output is a Nx2
matrix with (offset, lengths) packed for each segment.

For example, `[1, 3, 0, 2]` transforms into `[[0, 1], [1, 3], [4, 0], [4, 2]]`.
)DOC")
    .Input(0, "lengths", "1D tensor of int32 segment lengths.")
    .Output(
        0,
        "ranges",
        "2D tensor of shape len(lengths) X 2 and the same type as `lengths`");
OPERATOR_SCHEMA(SegmentIdsToLengths)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Transfers a vector of segment ids to a vector of segment lengths. This operation
supports non-consecutive segment ids. Segments not appearing in the input vector
will have length 0. If the second input is provided, the number of segments =
the size of its first dimension. Otherwise, the number of segments = the last
index in the first input vector + 1.

In general, for consecutive, zero-based segment IDs, this is the inverse
operation of LengthsToSegmentIds, except that a vector of segment IDs
cannot represent empty segments at the end (if the second input is absent).
)DOC")
    .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
    .Input(
        1,
        "data (optional)",
        "if provided, number of segments = the size of its first dimension")
    .Output(0, "lengths", "1-D int64_t tensor of segment lengths");
OPERATOR_SCHEMA(SegmentIdsToRanges)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Transfers a vector of segment ids to a vector of segment ranges. This operation
supports non-consecutive segment ids. Segments not appearing in the input vector
will have length 0. If the second input is provided, the number of segments =
the size of its first dimension. Otherwise, the number of segments = the last
index in the first input vector + 1.
)DOC")
    .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
    .Input(
        1,
        "data (optional)",
        "if provided, number of segments = the size of its first dimension")
    .Output(0, "ranges", "2-D int64_t tensor of (offset, length) per segment");
OPERATOR_SCHEMA(LengthsToWeights)
    .NumInputs(1)
    .NumOutputs(1)
    .Arg("power", "n of 1/pow(length,n) for normalization")
    .SetDoc(R"DOC(
Similar to LengthsToSegmentIds, but outputs a vector of segment
weights derived from the lengths, i.e. 1/pow(length, power).
)DOC")
    .Input(0, "lengths", "1-D int32_t or int64_t tensor of lengths")
    .Output(0, "a vector of weights", "1-D float tensor of weights by length");
SHOULD_NOT_DO_GRADIENT(WallClockTime);
OPERATOR_SCHEMA(UnsafeCoalesce)
    .NumInputsOutputs([](int inputs, int outputs) {
      return inputs + 1 == outputs;
    })
    .AllowInplace([](int input, int output) { return input == output; })
    .SetDoc(R"DOC(
Coalesce the N inputs into N outputs and a single coalesced output blob.

This allows operations that operate over multiple small kernels (e.g.
biases in a deep CNN) to be coalesced into a single larger operation,
amortizing the kernel launch overhead, synchronization costs for
distributed computation, etc.

The operator:

- computes the total size of the coalesced blob by summing the input sizes
- allocates the coalesced output blob as the total size
- copies the input vectors into the coalesced blob, at the correct offset
- aliases each Output(i) to point into the coalesced blob, at the
  corresponding offset for Input(i)

This is 'unsafe' as the output vectors are aliased, so use with
caution.
)DOC");

OPERATOR_SCHEMA(EnsureDense)
    .NumInputs(1)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(
This operator converts dense or sparse gradients to dense ones.
Therefore, sparse gradient can be back propagated to Operators that consume
dense gradients only (e.g., FCGradient).

The operator's behaviors:

- In forward, simply pass in place or copy input to the output.
- In backward, if the gradient passed-in is sparse gradient, change it to
  dense gradient in linear time; otherwise, simply pass the dense gradient.
)DOC")
    .Input(0, "input", "Input tensor.")
    .Output(0, "output", "Output tensor. Same dimension as inputs.");
OPERATOR_SCHEMA(AccumulateHistogram)
    .NumInputs(1)
    .NumOutputs(2)
    .SetDoc(R"DOC(
This operator calculates the histogram of values in the input tensor.
There are 2 outputs, one for the histogram of the current input tensor, and
another for the histogram of all input tensors accumulated through history.
Each output contains num_buckets + 2 values: indices [1 ... num_buckets]
count values in the [lower_bound, upper_bound) interval, and the remaining 2
count values smaller than lower_bound or greater than upper_bound,
respectively.
)DOC")
    .Input(0, "X", "Input tensor.")
    .Output(0, "CurHist", "Output histogram of the current tensor.")
    .Output(1, "AccHist", "Accumulated histogram of the history tensor.")
    .Arg("lower_bound", "the lower bound value")
    .Arg("upper_bound", "the upper bound value")
    .Arg("num_buckets", "number of buckets to use in [lower_bound, upper_bound)");
class GetEnsureDenseGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    CAFFE_ENFORCE(
        GradOut(0).IsSparse() || GradOut(0).IsDense(),
        "Input gradient ",
        O(0),
        " should be either sparse or dense.");

    if (GradOut(0).IsDense()) {
      // Dense gradients pass straight through.
      SetDense(0, GO(0));
      return vector<OperatorDef>();
    } else {
      // Sparse gradients are converted to dense ones.
      return SingleGradientDef(
          "SparseToDense",
          "",
          vector<string>{GO_I(0), GO_V(0), I(0)},
          vector<string>{GI(0)});
    }
  }
};
REGISTER_GRADIENT(EnsureDense, GetEnsureDenseGradient);
SHOULD_NOT_DO_GRADIENT(Print);
SHOULD_NOT_DO_GRADIENT(HasElements);
SHOULD_NOT_DO_GRADIENT(IsEmpty);
SHOULD_NOT_DO_GRADIENT(LengthsToShape);
SHOULD_NOT_DO_GRADIENT(UnsafeCoalesce);
class GetAliasGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // Simply pass the gradient along; nothing needs to be calculated.
    SetDense(0, GO(0));
    return vector<OperatorDef>();
  }
};
REGISTER_GRADIENT(Alias, GetAliasGradient);
SHOULD_NOT_DO_GRADIENT(ResizeLike);
class GetSumGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // The gradient of a sum flows unchanged to every summand.
    for (auto i = 0; i < def_.input_size(); ++i) {
      SetDense(i, GO(0));
    }
    return vector<OperatorDef>();
  }
};
REGISTER_GRADIENT(Sum, GetSumGradient);
SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterAssign);
class GetWeightedSumGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    ArgumentHelper argsHelper(def_);
    const bool grad_on_w = argsHelper.GetSingleArgument<bool>("grad_on_w", 0);

    auto inputs = vector<string>{GO(0)};
    auto outputs = vector<string>();
    for (int i = 0; i < def_.input_size(); i += 2) {
      inputs.push_back(I(i));
      inputs.push_back(I(i + 1));
      outputs.push_back(GI(i));
    }

    // Optionally also produce gradients for the weights.
    if (grad_on_w) {
      for (int i = 0; i < def_.input_size(); i += 2) {
        outputs.push_back(GI(i + 1));
      }
    }

    return SingleGradientDef("WeightedSumGradient", "", inputs, outputs);
  }
};
REGISTER_GRADIENT(WeightedSum, GetWeightedSumGradient);
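// For reference (assumed from the forward semantics): since
// output = sum_i weight_i * X_i, the data gradients are
// dX_i = weight_i * d(output), and with grad_on_w set, each weight gradient
// is the sum over elements of d(output) * X_i.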
class GetGatherGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    ArgumentHelper argsHelper(def_);
    const bool dense_gradient =
        argsHelper.GetSingleArgument<bool>("dense_gradient", false);

    using Op = GatherOp<CPUContext>;

    if (dense_gradient) {
      return vector<OperatorDef>{CreateOperatorDef(
          "SparseToDense",
          "",
          vector<string>{I(Op::INDICES), GO(0), I(Op::DATA)},
          vector<string>{GI(Op::DATA)})};
    } else {
      // The gradient of Gather is naturally sparse: only the gathered rows
      // of DATA receive gradient.
      SetSparse(Op::DATA, I(Op::INDICES), GO(0));
      return vector<OperatorDef>();
    }
  }
};
REGISTER_GRADIENT(Gather, GetGatherGradient);
struct GetFlattenToVecGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "ResizeLike", "", vector<string>{GO(0), I(0)}, vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient);
struct GetCopyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CopyOnDeviceLike",
        "",
        vector<string>{GO(0), I(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(Copy, GetCopyGradient);
struct GetGPUToCPUGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (g_output_[0].IsDense()) {
      return SingleGradientDef(
          "CopyCPUToGPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
    } else {
      // Sparse gradients copy indices and values separately.
      return vector<OperatorDef>{CreateOperatorDef(
                                     "CopyCPUToGPU",
                                     "",
                                     std::vector<string>{GO_I(0)},
                                     std::vector<string>{GI_I(0)}),
                                 CreateOperatorDef(
                                     "CopyCPUToGPU",
                                     "",
                                     std::vector<string>{GO_V(0)},
                                     std::vector<string>{GI_V(0)})};
    }
  }
};
REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient);
struct GetCPUToGPUGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (g_output_[0].IsDense()) {
      return SingleGradientDef(
          "CopyGPUToCPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
    } else {
      // Sparse gradients copy indices and values separately.
      return vector<OperatorDef>{CreateOperatorDef(
                                     "CopyGPUToCPU",
                                     "",
                                     std::vector<string>{GO_I(0)},
                                     std::vector<string>{GI_I(0)}),
                                 CreateOperatorDef(
                                     "CopyGPUToCPU",
                                     "",
                                     std::vector<string>{GO_V(0)},
                                     std::vector<string>{GI_V(0)})};
    }
  }
};
REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);
SHOULD_NOT_DO_GRADIENT(Unique);
SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
SHOULD_NOT_DO_GRADIENT(GatherRanges);
SHOULD_NOT_DO_GRADIENT(LengthsGather);
SHOULD_NOT_DO_GRADIENT(AccumulateHistogram);
template <>
bool NanCheckOp<CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  const int D = X.size();
  const float* data = X.data<float>();
  ConstEigenVectorMap<float> input_data(data, D);

  bool all_finite = input_data.allFinite();

  if (!all_finite) {
    std::cerr << "Tensor contained NaN or inf: ["
              << this->debug_def().input(0) << "]" << std::endl;

    for (int j = 0; j < InputSize(); j++) {
      std::cerr << "Tensor name: " << this->debug_def().input(j) << std::endl;
      std::cerr << "Input tensor:" << std::endl;
      tensorPrinter_.Print<float>(Input(j));
      std::cerr << "NaN idxs:" << std::endl;
      const float* x = Input(j).data<float>();
      for (size_t i = 0; i < Input(j).size(); ++i) {
        if (std::isnan(x[i]) || std::isinf(x[i])) {
          std::cerr << i << " ";
        }
      }
      std::cerr << std::endl;
    }
    return false;
  }

  // Only copy when the op is not run in-place.
  if (&X != Y) {
    Y->CopyFrom(X, &context_);
  }
  return true;
}
REGISTER_CPU_OPERATOR(NanCheck, NanCheckOp<CPUContext>);
REGISTER_GRADIENT(NanCheck, GetNanCheckGradient);
OPERATOR_SCHEMA(NanCheck)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc("Identity operator, but checks all values for nan or inf")
    .Input(0, "tensor", "Tensor to check for nan/inf")
    .Output(
        0,
        "output",
        "Tensor to copy input into if no NaNs or inf.");

OPERATOR_SCHEMA(Size)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(
        "Return a 1D tensor of type int64 that contains the number "
        "of elements of the input tensor")
    .Input(0, "tensor", "Tensor to calculate number of elements")
    .Output(
        0,
        "output",
        "1D tensor of type int64 that contains the number of "
        "elements in the input tensor.");
REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>);
template <>
template <typename T>
bool RangeOp<CPUContext>::DoRunOnDevice(
    const T& start,
    const T& step,
    Tensor<CPUContext>* output) {
  auto* output_data = output->template mutable_data<T>();
  for (int i = 0; i < output->size(); ++i) {
    output_data[i] = i * step + start;
  }
  return true;
}
OPERATOR_SCHEMA(Range)
    .NumInputs(1, 3)
    .NumOutputs(1)
    .SetDoc(
        "Values are generated within the half-open interval [start, stop) "
        "(in other words, the interval including start but excluding stop). "
        "When called with a single value, this will return `[0, v)` with the "
        "result type inferred from the input types.")
    .Input(
        0,
        "start",
        "Optional scalar Tensor with the start of the interval (inclusive).")
    .Input(1, "stop", "scalar Tensor with the end of the interval (exclusive)")
    .Input(2, "step", "Optional scalar Tensor with spacing between values.")
    .Output(
        0,
        "output",
        "1D tensor of same type as inputs that contains the sequence.");
REGISTER_CPU_OPERATOR(Range, RangeOp<CPUContext>);

} // namespace caffe2