Caffe2 - C++ API
A deep learning, cross-platform ML framework
cross_entropy_op.cc
#include "caffe2/operators/cross_entropy_op.h"

namespace caffe2 {

namespace {

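// Numerically stable form of tgt * lgt - log(1 + exp(lgt)), the (negated)
// per-element sigmoid cross entropy: folding on (lgt >= 0) ensures exp() is
// only ever called with a non-positive argument, so it cannot overflow.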
inline float sigmoid_xent_forward(float lgt, float tgt) {
  return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}

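// Derivative of sigmoid_xent_forward with respect to the logit:
// tgt - sigmoid(lgt).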
inline float sigmoid_xent_backward(float lgt, float tgt) {
  return tgt - 1. / (1. + exp(-lgt));
}
} // namespace

template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto* Y = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  Y->Resize(N);
  const auto* Xdata = X.data<float>();
  const auto* labelData = label.data<int>();
  auto* Ydata = Y->mutable_data<float>();
  CAFFE_ENFORCE(
      (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() &&
          (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(),
      "Label seems to be outside of supported range. Supported labels are in "
      "range [0,",
      D,
      ")");
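  // Per-example loss: Y[i] = -log(X[i, label[i]]), with the probability
  // clamped from below at kLOG_THRESHOLD() for numerical stability.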
  for (int i = 0; i < N; ++i) {
    Ydata[i] = -log(std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD()));
  }
  return true;
}

template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  CAFFE_ENFORCE_EQ(logits.dims(), targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();

  auto in_idx = 0;
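  // Average the per-class sigmoid cross entropy over the last axis of each
  // example; the leading minus turns the log-likelihood into a loss.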
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
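  // Backpropagate through the averaged loss: each element receives
  // -g[i] / inner_size * (target - sigmoid(logit)).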
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
  }
  return true;
}

template <>
bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  auto& weights = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  CAFFE_ENFORCE(weights.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* weights_ptr = weights.data<float>();

  auto in_idx = 0;
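  // Same reduction as SigmoidCrossEntropyWithLogits, except each element's
  // contribution is scaled by its weight before the row average.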
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]) *
          weights_ptr[in_idx];
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

template <>
bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
    RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  auto& weights = Input(3);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  CAFFE_ENFORCE(weights.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* weights_ptr = weights.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]) *
          weights_ptr[in_idx];
      ++in_idx;
    }
  }
  return true;
}

template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  CAFFE_ENFORCE_EQ(dY.ndim(), 1);
  CAFFE_ENFORCE_EQ(dY.dim32(0), N);
  dX->ResizeLike(X);
  math::Set<float, CPUContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  const float* Xdata = X.data<float>();
  const float* dYdata = dY.data<float>();
  const int* labelData = label.data<int>();
  float* dXdata = dX->mutable_data<float>();
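  // Only the correct-class entry of each row receives a nonzero gradient:
  // d(-log(x))/dx = -1/x, scaled by the incoming gradient dY[i].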
  for (int i = 0; i < N; ++i) {
    dXdata[i * D + labelData[i]] =
        -dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD());
  }
  return true;
}

template <>
bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  auto shape = X.dims();
  shape.push_back(2);
  TIndex N = X.size();
  Y->Resize(shape);
  const auto* Xdata = X.data<float>();
  auto* Ydata = Y->mutable_data<float>();
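  // Expand each probability p into the two-class row (1 - p, p).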
  for (TIndex i = 0; i < N; ++i) {
    DCHECK_GE(Xdata[i], 0.0);
    DCHECK_LE(Xdata[i], 1.0);
    Ydata[i * 2] = 1.0 - Xdata[i];
    Ydata[i * 2 + 1] = Xdata[i];
  }
  return true;
}

template <>
bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
  auto& dY = Input(0);
  auto* dX = Output(0);
  auto shape = dY.dims();
  CAFFE_ENFORCE_GE(shape.size(), 1);
  CAFFE_ENFORCE_EQ(shape.back(), 2);
  shape.pop_back();
  dX->Resize(shape);
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  TIndex N = dX->size();
  // use eigen?
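  // Since Y = (1 - X, X), dL/dX = dL/dY[., 1] - dL/dY[., 0].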
  for (TIndex i = 0; i < N; ++i) {
    dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2];
  }
  return true;
}

template <>
bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto* Y = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  Y->Resize(vector<TIndex>{N});
  const float* Xdata = X.data<float>();
  const float* labelData = label.data<float>();
  auto* Ydata = Y->mutable_data<float>();
  CAFFE_ENFORCE(
      (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() &&
          (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(),
      "Soft label seems incorrect: label value should be a probability ",
      "between 0 and 1.0. You may be using the wrong cross entropy operator; ",
      "use LabelCrossEntropy if the labels are integers whose values are at ",
      "most the number of classes, ",
      D,
      ".");
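  // Columns index examples: Y[i] = -sum_j label[j, i] * log(max(X[j, i],
  // kLOG_THRESHOLD())).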
  EigenArrayMap<float>(Ydata, 1, N) =
      -(ConstEigenArrayMap<float>(labelData, D, N) *
        ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()).log())
           .colwise()
           .sum();
  return true;
}

template <>
bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  CAFFE_ENFORCE_EQ(dY.ndim(), 1);
  CAFFE_ENFORCE_EQ(dY.dim32(0), N);
  dX->ResizeLike(X);
  math::Set<float, CPUContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  const float* Xdata = X.data<float>();
  const float* dYdata = dY.data<float>();
  const float* labelData = label.data<float>();
  float* dXdata = dX->mutable_data<float>();
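  // dX[j, i] = -dY[i] * label[j, i] / max(X[j, i], kLOG_THRESHOLD()).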
  EigenArrayMap<float>(dXdata, D, N) =
      (ConstEigenArrayMap<float>(labelData, D, N) /
       ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()))
          .rowwise() *
      (-ConstEigenVectorArrayMap<float>(dYdata, N).transpose());
  return true;
}

REGISTER_CPU_OPERATOR(LabelCrossEntropy,
                      LabelCrossEntropyOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
                      LabelCrossEntropyGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LabelCrossEntropy)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Operator computes the cross entropy between the input and the label set. In
practice, it is most commonly used at the end of models, after the SoftMax
operator and before the AveragedLoss operator. Note that LabelCrossEntropy
assumes that the label provided is either a 1D array of size N (batch size) or
a 2D array of size N x 1 (batch size). Each entry in the label vector indicates
the correct class; as such, each entry must be between 0 and D - 1, inclusive,
where D is the total number of classes. The formula used is:

  Y[i] = -log(X[i][j])

where i indexes the example in the batch and j is the correct class for that
example, as given by label[i]. Each log has a lower limit for numerical
stability.
)DOC")
    .Input(
        0,
        "X",
        "Input blob from the previous layer, which is almost always "
        "the result of a softmax operation; X is a 2D array of size N x D, "
        "where N is the batch size and D is the number of classes")
    .Input(1, "label", "Blob containing the labels used to compare the input")
    .Output(0, "Y", "Output blob after the cross entropy computation");
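
// Illustrative example: with X = [[0.1, 0.7, 0.2]] and label = [1],
// Y = [-log(0.7)] ~= [0.357].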
OPERATOR_SCHEMA(LabelCrossEntropyGradient)
    .NumInputs(3)
    .NumOutputs(1);

class GetLabelCrossEntropyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LabelCrossEntropyGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LabelCrossEntropy, GetLabelCrossEntropyGradient);

REGISTER_CPU_OPERATOR(MakeTwoClass,
                      MakeTwoClassOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
                      MakeTwoClassGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogits,
    SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogitsGradient,
    SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    WeightedSigmoidCrossEntropyWithLogits,
    WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    WeightedSigmoidCrossEntropyWithLogitsGradient,
    WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(MakeTwoClass)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        [](const OperatorDef& /* unused */, const vector<TensorShape>& in) {
          vector<TensorShape> out(1);
          out[0].add_dims(in[0].dims(0));
          out[0].add_dims(2);
          return out;
        })
    .SetDoc(R"DOC(
Given a vector of probabilities, this operator transforms this into a 2-column
matrix with complementary probabilities for binary classification. In explicit
terms, given the vector X, the output Y is vstack(1 - X, X).
)DOC")
    .Input(0, "X", "Input vector of probabilities")
    .Output(
        0,
        "Y",
        "2-column matrix with complementary probabilities of X for "
        "binary classification");

OPERATOR_SCHEMA(MakeTwoClassGradient)
    .NumInputs(1)
    .NumOutputs(1);

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two matrices, logits and targets, of the same shape
(batch_size, num_classes), computes the sigmoid cross entropy between the two,
averaged over the classes of each example.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Output(0, "xentropy", "Vector with the averaged xentropy for each example.");
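
// Illustrative example: logits = [[0.0]] and targets = [[1.0]] give
// xentropy = [log(2)] ~= [0.693], since sigmoid(0) = 0.5.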

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(3)
    .NumOutputs(1);

OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogits)
    .NumInputs(3)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given three matrices, logits, targets, and weights, all of the same shape
(batch_size, num_classes), computes the weighted sigmoid cross entropy between
logits and targets. Specifically, at each position (r, c), this computes
weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then
averages over each row.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Input(2, "weights", "matrix of weights, same shape as logits.")
    .Output(0, "xentropy", "Vector with the averaged xentropy for each example.");
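
// Illustrative example: logits = [[0.0, 0.0]], targets = [[1.0, 1.0]], and
// weights = [[1.0, 0.5]] give xentropy = [1.5 * log(2) / 2] ~= [0.520].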

OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(4)
    .NumOutputs(1);

class GetMakeTwoClassGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "MakeTwoClassGradient",
        "",
        vector<string>{GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient);

class GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    SigmoidCrossEntropyWithLogits,
    GetSigmoidCrossEntropyWithLogitsGradient);

class GetWeightedSigmoidCrossEntropyWithLogitsGradient
    : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "WeightedSigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1), I(2)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    WeightedSigmoidCrossEntropyWithLogits,
    GetWeightedSigmoidCrossEntropyWithLogitsGradient);

REGISTER_CPU_OPERATOR(CrossEntropy,
                      CrossEntropyOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(CrossEntropyGradient,
                      CrossEntropyGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(CrossEntropy)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Operator computes the cross entropy between the input and the label set. In
practice, it is most commonly used at the end of models, after the SoftMax
operator and before the AveragedLoss operator. Note that CrossEntropy
assumes that the soft labels provided are a 2D array of size N x D
(batch size x number of classes). Each entry in the 2D label array is the
soft label for the corresponding input, where each element represents the
probability of that class being the correct one. As such, each element must
be between 0 and 1, and all elements in an entry must sum to 1. The formula
used is:

  Y[i] = -sum_j (label[i][j] * log(X[i][j]))

where i indexes the example in the batch and j indexes the class. Each log has
a lower limit for numerical stability.
)DOC")
    .Input(
        0,
        "X",
        "Input blob from the previous layer, which is almost always "
        "the result of a softmax operation; X is a 2D array of size N x D, "
        "where N is the batch size and D is the number of classes")
    .Input(1, "label", "Blob containing the labels used to compare the input")
    .Output(0, "Y", "Output blob after the cross entropy computation");
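
// Illustrative example: with X = [[0.5, 0.5]] and label = [[1.0, 0.0]],
// Y = [-(1.0 * log(0.5) + 0.0 * log(0.5))] ~= [0.693].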
OPERATOR_SCHEMA(CrossEntropyGradient)
    .NumInputs(3)
    .NumOutputs(1);

class GetCrossEntropyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CrossEntropyGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(CrossEntropy, GetCrossEntropyGradient);

} // namespace caffe2