#include "softmax_with_loss_op.h"
#include "softmax_shared.h"

#include <algorithm> // std::max, std::max_element
#include <cmath>     // std::exp, std::log, std::abs
#include <vector>

namespace caffe2 {

REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SoftmaxWithLossGradient,
    SoftmaxWithLossGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(SoftmaxWithLoss)
    .NumInputs(2, 3)
    .NumOutputs(2)
    .TensorInferenceFunction(
        [](const OperatorDef& def, const vector<TensorShape>& in) {
          ArgumentHelper helper(def);
          auto axis = helper.GetSingleArgument<int32_t>("axis", 1);

          vector<TensorShape> out(2); // [probabilities, average loss]

          auto logits = in[0]; // Tensor with shape [batch_size, num_classes]
          const auto canonical_axis =
              canonical_axis_index_(axis, logits.dims().size());
          const int batch_size =
              size_to_dim_(canonical_axis, GetDimsVector(logits));
          const int num_classes =
              size_from_dim_(canonical_axis, GetDimsVector(logits));

          // E.g. logits of shape [64, 10] with axis = 1 give a [64, 10]
          // probability tensor; out[1] (the scalar loss) is left unset.
          out[0].set_data_type(logits.data_type());
          out[0].add_dims(batch_size);
          out[0].add_dims(num_classes);

          return out;
        })
    .SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator. The operator computes the
softmax normalized values for each layer in the batch of the given input,
after which cross-entropy loss is computed. This operator is numerically more
stable than separate Softmax and CrossEntropy ops. The inputs are a 2-D
tensor (Tensor<float>) of size (batch_size x input_feature_dimensions) and a
tensor of labels (ground truth). The outputs are a tensor with the
probability for each label for each example (N x D) and the averaged loss
(a scalar). Use the argument label_prob=1 to pass labels as a probability
distribution over classes. An optional third input blob can be used to
weight the samples for the loss.
)DOC")
    .Input(0, "logits", "Unscaled log probabilities")
    .Input(1, "labels", "Ground truth")
    .Input(
        2,
        "weight_tensor",
        "Optional blob to be used to weight the samples for the loss.")
    .Output(0, "softmax", "Tensor with softmax probabilities (N x D)")
    .Output(1, "loss", "Average loss");
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);
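// A minimal standalone sketch of the forward math implemented below
// (illustrative only; this helper is hypothetical and not used by the
// operator). For one row of logits x[0..D-1] with hard label y, the op
// computes p_j = exp(x_j - m) / sum_k exp(x_k - m) with m = max_k x_k, and
// the per-sample loss -log(p_y). Subtracting the row max before
// exponentiating is what makes the fused op more numerically stable than
// separate Softmax and CrossEntropy ops.
namespace {
inline float SoftmaxXentRowReference(const float* x, int D, int label) {
  const float m = *std::max_element(x, x + D);
  float sum = 0.0f;
  for (int j = 0; j < D; ++j) {
    sum += std::exp(x[j] - m);
  }
  // log p_label = (x_label - m) - log(sum); the loss is its negation.
  return -((x[label] - m) - std::log(sum));
}
} // anonymous namespace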
#define DONT_CARE (-1)

template <>
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  auto* P = Output(0); // Probabilities from softmax
  auto* avg_loss = Output(1); // Average loss
  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis); // number of classes
  P->ResizeLike(X);
  if (sum_multiplier_.size() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }
  float* Pdata = P->mutable_data<float>();
  const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);
  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }
  rowmax_.Resize(N);
  losses_.Resize(N);

  // In hard-label mode (!label_prob_mode_) this fills Pdata with
  // log-probabilities; math::Exp below converts them to probabilities
  // after the loss has been read off.
  SoftmaxCPU(
      context_,
      N,
      D,
      X.data<float>(),
      Pdata,
      losses_.mutable_data<float>(),
      sum_multiplier_.data<float>(),
      !label_prob_mode_,
      rowmax_.mutable_data<float>());
  // Then compute cross entropy.
  float loss_sum = 0.0;
  float weight_sum = 0.0;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();
    const float* Xdata = X.data<float>();

    for (int i = 0; i < N; ++i) {
      CAFFE_ENFORCE(
          label_data[i] < D && label_data[i] >= 0,
          "Label seems incorrect: label value larger than number of classes: ",
          label_data[i],
          " vs ",
          D);
      float weight = weights ? weights[i] : 1.0;
      // Pdata holds log-probabilities here, so the per-sample loss is
      // simply the negated log-probability of the correct class.
      float l = -Pdata[i * D + label_data[i]] * weight;
      loss_sum += l;
      weight_sum += weight;
    }
    math::Exp(N * D, Pdata, Pdata, &context_);
  } else {
    const float* label_data = T.data<float>();

    for (int i = 0; i < N; ++i) {
      float l = 0.0;
      float total_prob = 0.0;
      float weight = weights ? weights[i] : 1.0;
      for (int j = 0; j < D; ++j) {
        CAFFE_ENFORCE(
            label_data[i * D + j] >= 0,
            "Label prob seems incorrect: label prob value must be nonnegative: ",
            label_data[i * D + j]);
        l += -log(std::max(Pdata[i * D + j], 1e-20f)) * label_data[i * D + j] *
            weight;
        total_prob += label_data[i * D + j];
      }
      loss_sum += l;
      CAFFE_ENFORCE(
          std::abs(total_prob - 1.) < 1e-5f,
          "Label prob seems incorrect: label prob values do not sum to 1.0: ",
          total_prob,
          " vs 1.0 (+/- 1e-5)");
      weight_sum += weight;
    }
  }
  // The loss output is a scalar: total weighted loss normalized by the
  // total weight (the plain average when no weight blob is given).
  avg_loss->Resize(vector<TIndex>());
  float* avg_loss_data = avg_loss->mutable_data<float>();
  if (weight_sum != 0.0) {
    avg_loss_data[0] = loss_sum * scale_ / weight_sum;
  } else {
    avg_loss_data[0] = 0.0;
  }
  return true;
}
template <>
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  // Input(2) is the optional weight blob, if given.
  auto& P = Input(InputSize() - 2); // Probabilities from softmax
  auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
  auto* dX = Output(0);
  const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);
  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis); // number of classes
  dX->ResizeLike(X);
  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }
  const float* Pdata = P.data<float>();
  float* dX_data = dX->mutable_data<float>();

  // Copy the softmax probabilities into dX: for every neuron other than the
  // one matching the correct label, the gradient is exactly the softmax
  // probability p_j.
  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);
  // Compute the gradient for the entries matching the labels.
  float total_weight = 0.0f;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        float weight = weights[i];
        dX_data[idx] = Pdata[idx] - 1.0f;
        for (int d = 0; d < D; d++) {
          int k = i * D + d;
          dX_data[k] *= weight;
        }

        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        dX_data[idx] = Pdata[idx] - 1.0f;
      }
      total_weight = N;
    }
  } else {
    const float* label_data = T.data<float>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        float weight = weights[i];
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = (Pdata[idx] - label_data[idx]) * weight;
        }
        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = Pdata[idx] - label_data[idx];
        }
      }
      total_weight = N;
    }
  }
  // Scale everything by d_avg_loss * scale_ / total_weight.
  if (total_weight > 0) {
    math::Scale<float, CPUContext>(
        dX->size(),
        scale_ / total_weight * d_avg_loss.data<float>()[0],
        dX->data<float>(),
        dX_data,
        &context_);
  }
  return true;
}
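// A matching sketch of the backward math above (illustrative only; this
// helper is hypothetical and not used by the operator). For one row with
// softmax output p and hard label y, d(loss)/d(x_j) = p_j - [j == y]; the
// loops above additionally fold in the per-sample weight, and math::Scale
// then applies d_avg_loss * scale_ / total_weight to the whole tensor.
namespace {
inline void SoftmaxXentGradRowReference(
    const float* p,
    int D,
    int label,
    float coeff, // combined weight and loss-scaling factor
    float* dx) {
  for (int j = 0; j < D; ++j) {
    dx[j] = coeff * (p[j] - (j == label ? 1.0f : 0.0f));
  }
}
} // anonymous namespace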
class GetSoftmaxWithLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> blob_names{
        {I(0), I(1), O(0), GO(1)},
    };
    // Add the weight blob, if given.
    if (def_.input_size() == 3) {
      blob_names.emplace(blob_names.begin() + 2, I(2));
    }
    return SingleGradientDef(
        "SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
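// Note on ordering: with the optional weight input the gradient op receives
// {X, T, weights, P, d_avg_loss}; without it, {X, T, P, d_avg_loss}. That is
// why SoftmaxWithLossGradientOp reads P and d_avg_loss from the back
// (InputSize() - 2 and InputSize() - 1) and only dereferences weights when
// InputSize() > 4.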
} // namespace caffe2