1 #include "caffe2/operators/cross_entropy_op.h" 7 inline float sigmoid_xent_forward(
float lgt,
float tgt) {
8 return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
/**
 * Derivative of sigmoid_xent_forward with respect to the logit.
 *
 * @param lgt raw logit (pre-sigmoid score)
 * @param tgt target value for this element
 * @return tgt - sigmoid(lgt)
 */
inline float sigmoid_xent_backward(float lgt, float tgt) {
  // Evaluate the sigmoid in double precision, as the original double literals
  // did; for very negative logits exp() saturates to +inf and the sigmoid
  // cleanly becomes 0.
  const double sigmoid = 1.0 / (1.0 + std::exp(-static_cast<double>(lgt)));
  return static_cast<float>(tgt - sigmoid);
}
// CPU/float forward pass of LabelCrossEntropy.
// NOTE(review): this text is a scraped listing -- original line numbers are
// fused into each line and several lines are missing (the declarations of X,
// Y, N and D and the openings of two checks are not visible here).
17 bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
// The labels are the second input.
19 auto& label = Input(1);
// D is the element count of X from dimension 1 onward.
24 D = X.size_from_dim(1);
// Tail of a check whose opening is not visible: label must be 1-D, or 2-D
// with a second dimension of 1.
30 (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
// The first label dimension must equal N.
31 CAFFE_ENFORCE_EQ(label.dim32(0), N);
// Raw buffers: X as float, label as int, Y as writable float.
33 const auto* Xdata = X.data<
float>();
34 const auto* labelData = label.data<
int>();
35 auto* Ydata = Y->mutable_data<
float>();
// Tail of a check whose opening is not visible: every one of the N labels
// must satisfy 0 <= label < D, since it is used below as a column index.
37 (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() &&
38 (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(),
39 "Label seems to be outside of supported range. Supported labels are in " 43 for (
int i = 0; i < N; ++i) {
// Row i: negative log of the entry at the labelled column, with the entry
// clamped from below by kLOG_THRESHOLD() before the log is taken.
44 Ydata[i] = -log(std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD()));
// CPU/float forward pass of SigmoidCrossEntropyWithLogits.
// NOTE(review): scraped listing -- line numbers are fused into the text and
// some lines (else branch, Resize with `dims`, loop-local declarations,
// closing braces) are not visible.
50 bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
51 auto& logits = Input(0);
52 auto& targets = Input(1);
// logits and targets must have identical dimensions.
53 CAFFE_ENFORCE_EQ(logits.dims(), targets.
dims());
// inner_size is the last dimension (1 for a 0-d tensor); outer_size is the
// number of groups of inner_size elements.
54 const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
55 const auto outer_size = logits.size() / inner_size;
57 auto* out = Output(0);
// A 0-d input yields an empty output shape.
58 if (logits.ndim() == 0) {
59 out->Resize(std::vector<TIndex>{});
// Otherwise `dims` is the logits shape without its last dimension
// (presumably passed to out->Resize on a line that is not visible -- confirm).
61 std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
64 auto* out_ptr = out->mutable_data<
float>();
66 auto* logits_ptr = logits.data<
float>();
67 auto* targets_ptr = targets.
data<
float>();
// NOTE(review): the loop counters are int while the sizes derive from
// size()/dims(); check for narrowing on very large tensors.
70 for (
int i = 0; i < outer_size; ++i) {
72 for (
int j = 0; j < inner_size; ++j) {
// Accumulate the per-element term; the declarations and updates of `value`
// and `in_idx` are not visible in this listing.
73 value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
// Output i is the negated mean of the accumulated terms over the inner
// dimension.
76 out_ptr[i] = -value / inner_size;
// CPU/float backward pass of SigmoidCrossEntropyWithLogits.
// NOTE(review): scraped listing -- the declaration of `g` (presumably the
// upstream gradient from Input(0) -- confirm), the handling of `in_idx` and
// the closing braces are not visible.
82 bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
84 auto& logits = Input(1);
85 auto& targets = Input(2);
// logits and targets must have identical dimensions.
86 CAFFE_ENFORCE(logits.dims() == targets.
dims());
// inner_size is the last dimension (1 for a 0-d tensor); outer_size is the
// number of groups of inner_size elements.
87 const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
88 const auto outer_size = logits.size() / inner_size;
// g must supply exactly one value per outer group.
89 CAFFE_ENFORCE(g.size() == outer_size);
// The output gradient has the same shape as logits.
91 auto* out = Output(0);
92 out->ResizeLike(logits);
93 auto* out_ptr = out->mutable_data<
float>();
95 auto* logits_ptr = logits.data<
float>();
96 auto* targets_ptr = targets.
data<
float>();
97 auto* g_ptr = g.data<
float>();
100 for (
int i = 0; i < outer_size; ++i) {
// Per-group scale: the incoming gradient, negated and divided by inner_size.
101 auto g_factor = -g_ptr[i] / inner_size;
102 for (
int j = 0; j < inner_size; ++j) {
// Each element's gradient is the group scale times sigmoid_xent_backward of
// that element's logit and target.
103 out_ptr[in_idx] = g_factor *
104 sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
// CPU/float forward pass of WeightedSigmoidCrossEntropyWithLogits.
// NOTE(review): scraped listing -- line numbers are fused into the text and
// some lines (else branch, Resize with `dims`, loop-local declarations, the
// right-hand factor of the accumulation, closing braces) are not visible.
112 bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
113 auto& logits = Input(0);
114 auto& targets = Input(1);
115 auto& weights = Input(2);
// logits, targets and weights must all share the same dimensions.
116 CAFFE_ENFORCE(logits.dims() == targets.
dims());
117 CAFFE_ENFORCE(weights.dims() == targets.
dims());
// inner_size is the last dimension (1 for a 0-d tensor); outer_size is the
// number of groups of inner_size elements.
118 const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
119 const auto outer_size = logits.size() / inner_size;
121 auto* out = Output(0);
// A 0-d input yields an empty output shape.
122 if (logits.ndim() == 0) {
123 out->Resize(std::vector<TIndex>{});
// Otherwise `dims` is the logits shape without its last dimension
// (presumably passed to out->Resize on a line that is not visible -- confirm).
125 std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
128 auto* out_ptr = out->mutable_data<
float>();
130 auto* logits_ptr = logits.data<
float>();
131 auto* targets_ptr = targets.
data<
float>();
132 auto* weights_ptr = weights.data<
float>();
135 for (
int i = 0; i < outer_size; ++i) {
137 for (
int j = 0; j < inner_size; ++j) {
// Accumulate the per-element term multiplied by a factor whose expression is
// on a line that is not visible (presumably weights_ptr[in_idx] -- confirm).
138 value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]) *
// Output i is the negated accumulated value divided by inner_size.
142 out_ptr[i] = -value / inner_size;
// CPU/float backward pass of WeightedSigmoidCrossEntropyWithLogits.
// NOTE(review): scraped listing -- the rest of the signature, the declaration
// of `g` (presumably the upstream gradient from Input(0) -- confirm), the
// final factor of the element gradient and the closing braces are not visible.
148 bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
151 auto& logits = Input(1);
152 auto& targets = Input(2);
153 auto& weights = Input(3);
// logits, targets and weights must all share the same dimensions.
154 CAFFE_ENFORCE(logits.dims() == targets.
dims());
155 CAFFE_ENFORCE(weights.dims() == targets.
dims());
// inner_size is the last dimension (1 for a 0-d tensor); outer_size is the
// number of groups of inner_size elements.
156 const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
157 const auto outer_size = logits.size() / inner_size;
// g must supply exactly one value per outer group.
158 CAFFE_ENFORCE(g.size() == outer_size);
// The output gradient has the same shape as logits.
160 auto* out = Output(0);
161 out->ResizeLike(logits);
162 auto* out_ptr = out->mutable_data<
float>();
164 auto* logits_ptr = logits.data<
float>();
165 auto* targets_ptr = targets.
data<
float>();
166 auto* weights_ptr = weights.data<
float>();
167 auto* g_ptr = g.data<
float>();
170 for (
int i = 0; i < outer_size; ++i) {
// Per-group scale: the incoming gradient, negated and divided by inner_size.
171 auto g_factor = -g_ptr[i] / inner_size;
172 for (
int j = 0; j < inner_size; ++j) {
// Element gradient: group scale times sigmoid_xent_backward, times a further
// factor on a line that is not visible (presumably the weight -- confirm).
173 out_ptr[in_idx] = g_factor *
174 sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]) *
// CPU/float backward pass of LabelCrossEntropy.
// NOTE(review): scraped listing -- the declarations of X, dY, N and D, the
// opening of the label-shape check, the last argument of math::Set and the
// closing braces are not visible.
183 bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
185 auto& label = Input(1);
187 auto* dX = Output(0);
// D is the element count of X from dimension 1 onward.
191 D = X.size_from_dim(1);
// Tail of a check whose opening is not visible: label must be 1-D, or 2-D
// with a second dimension of 1.
197 (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
// label and dY each carry one entry per row: dY is 1-D of length N, and the
// first label dimension is N.
198 CAFFE_ENFORCE_EQ(label.dim32(0), N);
199 CAFFE_ENFORCE_EQ(dY.ndim(), 1);
200 CAFFE_ENFORCE_EQ(dY.dim32(0), N);
// Zero-fill dX first; only the labelled entry of each row is written below.
202 math::Set<float, CPUContext>(dX->size(), 0.f, dX->mutable_data<
float>(),
204 const float* Xdata = X.data<
float>();
205 const float* dYdata = dY.data<
float>();
206 const int* labelData = label.data<
int>();
207 float* dXdata = dX->mutable_data<
float>();
// NOTE(review): no range check on labelData is visible in this listing before
// it is used as an index -- confirm one exists or is not needed.
208 for (
int i = 0; i < N; ++i) {
// Row i, labelled column: -dY[i] divided by the input value clamped from
// below by kLOG_THRESHOLD(), mirroring the clamp in the forward pass.
209 dXdata[i * D + labelData[i]] =
210 - dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD());
// CPU/float forward pass of MakeTwoClass: expands each input value x into the
// adjacent output pair (1 - x, x).
// NOTE(review): scraped listing -- the declarations of X, Y and N, the output
// resize and the closing braces are not visible.
216 bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
// Copy of the input dimensions (its later use is not visible here).
219 auto shape = X.dims();
223 const auto* Xdata = X.data<
float>();
224 auto* Ydata = Y->mutable_data<
float>();
225 for (TIndex i = 0; i < N; ++i) {
// Debug-only checks that each input lies in [0, 1].
226 DCHECK_GE(Xdata[i], 0.0);
227 DCHECK_LE(Xdata[i], 1.0);
// Slot 2i receives the complement, slot 2i + 1 the value itself.
228 Ydata[i * 2] = 1.0 - Xdata[i];
229 Ydata[i * 2 + 1] = Xdata[i];
// CPU/float backward pass of MakeTwoClass.
// NOTE(review): scraped listing -- the declaration of dY, the resize of dX
// and the closing braces are not visible.
235 bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
237 auto* dX = Output(0);
// dY must have at least one dimension, and its last dimension must be 2.
238 auto shape = dY.dims();
239 CAFFE_ENFORCE_GE(shape.size(), 1);
240 CAFFE_ENFORCE_EQ(shape.back(), 2);
243 const float* dYdata = dY.data<
float>();
244 float* dXdata = dX->mutable_data<
float>();
245 TIndex N = dX->size();
247 for (TIndex i = 0; i < N; ++i) {
// The forward pass wrote (1 - x, x) into slots 2i and 2i + 1, so the gradient
// with respect to x is dY[2i + 1] - dY[2i].
248 dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2];
// CPU/float forward pass of CrossEntropy (soft, floating-point labels).
// NOTE(review): scraped listing -- the declarations of X, Y, N and D, the
// openings of two checks, the end of the final expression and the closing
// braces are not visible.
254 bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
256 auto& label = Input(1);
// D is the element count of X from dimension 1 onward.
261 D = X.size_from_dim(1);
// Tail of a check whose opening is not visible: label is 1-D, or 2-D with a
// second dimension of D.
// NOTE(review): a 1-D label passes this check, yet labelData is mapped as a
// D x N array below; for D > 1 that looks like a read past the label buffer --
// confirm.
267 (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
268 CAFFE_ENFORCE_EQ(label.dim32(0), N);
// One output value per row.
269 Y->Resize(vector<TIndex>{N});
270 const float* Xdata = X.data<
float>();
271 const float* labelData = label.data<
float>();
272 auto* Ydata = Y->mutable_data<
float>();
// Tail of a check whose opening is not visible: every soft-label entry must
// lie in [0, 1].
274 (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() &&
275 (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(),
276 "Soft label seems incorrect: label value should be a probability ",
277 "between 0 and 1.0. You may be using the wrong cross entropy operator; ",
278 "use LabelCrossEntropy if the labels are integers whose values are at ",
279 "most the number of classes, ",
// Y (viewed as 1 x N) is assigned the negated element-wise product of the
// labels with log(max(X, kLOG_THRESHOLD())); the rest of the expression
// (presumably a per-column reduction -- confirm) is not visible.
282 EigenArrayMap<float>(Ydata, 1, N) =
283 -(ConstEigenArrayMap<float>(labelData, D, N) *
284 ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()).log())
// CPU/float backward pass of CrossEntropy (soft, floating-point labels).
// NOTE(review): scraped listing -- the declarations of X, dY, N and D, the
// opening of the label-shape check, one line of the final expression and the
// closing braces are not visible.
291 bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
293 auto& label = Input(1);
295 auto* dX = Output(0);
// D is the element count of X from dimension 1 onward.
299 D = X.size_from_dim(1);
// Tail of a check whose opening is not visible: label is 1-D, or 2-D with a
// second dimension of D.
// NOTE(review): a 1-D label passes this check, yet labelData is mapped as a
// D x N array below; for D > 1 that looks like a read past the label buffer --
// confirm.
305 (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
// label and dY each carry one entry per row: dY is 1-D of length N, and the
// first label dimension is N.
306 CAFFE_ENFORCE_EQ(label.dim32(0), N);
307 CAFFE_ENFORCE_EQ(dY.ndim(), 1);
308 CAFFE_ENFORCE_EQ(dY.dim32(0), N);
// Zero-fill dX before the assignment below.
310 math::Set<float, CPUContext>(
311 dX->size(), 0.f, dX->mutable_data<
float>(), &context_);
312 const float* Xdata = X.data<
float>();
313 const float* dYdata = dY.data<
float>();
314 const float* labelData = label.data<
float>();
315 float* dXdata = dX->mutable_data<
float>();
// dX (viewed as D x N) is built from label / max(X, kLOG_THRESHOLD()) and the
// negated, transposed dY vector; the line joining the two operands is not
// visible in this listing.
316 EigenArrayMap<float>(dXdata, D, N) =
317 (ConstEigenArrayMap<float>(labelData, D, N) /
318 ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()))
320 (-ConstEigenVectorArrayMap<float>(dYdata, N).transpose());
// Register the float/CPUContext specialisations under the operator names
// LabelCrossEntropy and LabelCrossEntropyGradient.
324 REGISTER_CPU_OPERATOR(LabelCrossEntropy,
325 LabelCrossEntropyOp<float, CPUContext>);
326 REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
327 LabelCrossEntropyGradientOp<float, CPUContext>);
329 OPERATOR_SCHEMA(LabelCrossEntropy)
332 .IdenticalTypeAndShapeOfInputDim(0, 0)
334 Operator computes the cross entropy between the input and the label set. In 335 practice, it is most commonly used at the end of models, after the SoftMax 336 operator and before the AveragedLoss operator. Note that LabelCrossEntropy 337 assumes that the label provided is either a 1D array of size N (batch size), or 338 a 2D array of size N x 1 (batch size). Each entry in the label vector indicates 339 which is the correct class; as such, each entry must be between 0 and D - 1, 340 inclusive, where D is the total number of classes. The formula used is: 344 where (i, j) is the classifier's prediction of the jth class (the correct one), 345 and i is the batch size. Each log has a lower limit for numerical stability. 350 "Input blob from the previous layer, which is almost always " 351 "the result of a softmax operation; X is a 2D array of size N x D, where N " 352 "is the batch size and D is the number of classes")
353 .Input(1,
"label",
"Blob containing the labels used to compare the input")
354 .Output(0,
"Y",
"Output blob after the cross entropy computation");
355 OPERATOR_SCHEMA(LabelCrossEntropyGradient)
360 using GradientMakerBase::GradientMakerBase;
361 vector<OperatorDef> GetGradientDefs()
override {
363 "LabelCrossEntropyGradient",
"",
364 vector<string>{I(0), I(1), GO(0)},
365 vector<string>{GI(0)});
370 REGISTER_CPU_OPERATOR(MakeTwoClass,
372 REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
375 REGISTER_CPU_OPERATOR(
376 SigmoidCrossEntropyWithLogits,
378 REGISTER_CPU_OPERATOR(
379 SigmoidCrossEntropyWithLogitsGradient,
382 REGISTER_CPU_OPERATOR(
383 WeightedSigmoidCrossEntropyWithLogits,
385 REGISTER_CPU_OPERATOR(
386 WeightedSigmoidCrossEntropyWithLogitsGradient,
389 OPERATOR_SCHEMA(MakeTwoClass)
392 .TensorInferenceFunction(
393 [](
const OperatorDef& ,
const vector<TensorShape>& in) {
394 vector<TensorShape> out(1);
395 out[0].add_dims(in[0].dims(0));
400 Given a vector of probabilities, this operator transforms this into a 2-column 401 matrix with complimentary probabilities for binary classification. In explicit 402 terms, given the vector X, the output Y is vstack(1 - X, X). 404 .Input(0, "X",
"Input vector of probabilities")
408 "2-column matrix with complimentary probabilities of X for " 409 "binary classification");
411 OPERATOR_SCHEMA(MakeTwoClassGradient)
415 OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
418 .IdenticalTypeAndShapeOfInputDim(0, 0)
420 Given two matrices logits and targets, of same shape, 421 (batch_size, num_classes), computes the sigmoid cross entropy between the two. 422 Returns a tensor of shape (batch_size,) of losses for each example. 424 .Input(0, "logits",
"matrix of logits for each example and class.")
425 .Input(1,
"targets",
"matrix of targets, same shape as logits.")
426 .Output(0,
"xentropy",
"Vector with the total xentropy for each example.");
428 OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
432 OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogits)
435 .IdenticalTypeAndShapeOfInputDim(0, 0)
437 Given three matrices: logits, targets, weights, all of the same shape, 438 (batch_size, num_classes), computes the weighted sigmoid cross entropy between 439 logits and targets. Specifically, at each position r,c, this computes 440 weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then 441 averages over each row. 442 Returns a tensor of shape (batch_size,) of losses for each example. 444 .Input(0, "logits",
"matrix of logits for each example and class.")
445 .Input(1,
"targets",
"matrix of targets, same shape as logits.")
446 .Input(2,
"weights",
"matrix of weights, same shape as logits.")
447 .Output(0,
"xentropy",
"Vector with the total xentropy for each example.");
449 OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogitsGradient)
454 using GradientMakerBase::GradientMakerBase;
455 vector<OperatorDef> GetGradientDefs()
override {
457 "MakeTwoClassGradient",
459 vector<string>{GO(0)},
460 vector<string>{GI(0)});
466 using GradientMakerBase::GradientMakerBase;
467 vector<OperatorDef> GetGradientDefs()
override {
469 "SigmoidCrossEntropyWithLogitsGradient",
471 vector<string>{GO(0), I(0), I(1)},
472 vector<string>{GI(0)});
476 SigmoidCrossEntropyWithLogits,
481 using GradientMakerBase::GradientMakerBase;
482 vector<OperatorDef> GetGradientDefs()
override {
484 "WeightedSigmoidCrossEntropyWithLogitsGradient",
486 vector<string>{GO(0), I(0), I(1), I(2)},
487 vector<string>{GI(0)});
491 WeightedSigmoidCrossEntropyWithLogits,
494 REGISTER_CPU_OPERATOR(CrossEntropy,
496 REGISTER_CPU_OPERATOR(CrossEntropyGradient,
499 OPERATOR_SCHEMA(CrossEntropy)
502 .IdenticalTypeAndShapeOfInputDim(0, 0)
504 Operator computes the cross entropy between the input and the label set. In 505 practice, it is most commonly used at the end of models, after the SoftMax 506 operator and before the AveragedLoss operator. Note that CrossEntropy 507 assumes that the soft labels provided is a 2D array of size N x D 508 (batch size x number of classes). Each entry in the 2D label corresponds to 509 the soft label for the input, where each element represents the correct 510 probability of the class being selected. As such, each element must be between 511 0 and 1, and all elements in an entry must sum to 1. The formula used is: 513 Y[i] = sum_j (label[i][j] * log(X[i][j])) 515 where (i, j) is the classifier's prediction of the jth class (the correct one), 516 and i is the batch size. Each log has a lower limit for numerical stability. 521 "Input blob from the previous layer, which is almost always " 522 "the result of a softmax operation; X is a 2D array of size N x D, where N " 523 "is the batch size and D is the number of classes")
524 .Input(1,
"label",
"Blob containing the labels used to compare the input")
525 .Output(0,
"Y",
"Output blob after the cross entropy computation");
526 OPERATOR_SCHEMA(CrossEntropyGradient)
531 using GradientMakerBase::GradientMakerBase;
532 vector<OperatorDef> GetGradientDefs()
override {
534 "CrossEntropyGradient",
"",
535 vector<string>{I(0), I(1), GO(0)},
536 vector<string>{GI(0)});
const T * data() const
Returns a typed pointer of the underlying storage.
const vector< TIndex > & dims() const
Returns the dimensions of the tensor as a vector.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
static vector< OperatorDef > SingleGradientDef(const Args &...args)
a helper function to allow one to create one single operator def, which is usually the case for many ...