Caffe2 - C++ API
A deep learning, cross-platform ML framework
softmax_with_loss_op.cc
#include "softmax_with_loss_op.h"
#include "softmax_shared.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SoftmaxWithLossGradient,
    SoftmaxWithLossGradientOp<float, CPUContext>);

// Input: X (logits), T (labels); Output: P (probs), L (average loss)
OPERATOR_SCHEMA(SoftmaxWithLoss)
    .NumInputs(2, 3)
    .NumOutputs(2)
    .TensorInferenceFunction(
        [](const OperatorDef& def, const vector<TensorShape>& in) {
          ArgumentHelper helper(def);
          auto axis = helper.GetSingleArgument<int32_t>("axis", 1);

          vector<TensorShape> out(2);

          auto logits = in[0]; // Tensor with shape [batch_size, num_classes]
          auto labels = in[1]; // Tensor with shape [batch_size, ]
          const auto canonical_axis =
              canonical_axis_index_(axis, logits.dims().size());
          const int batch_size =
              size_to_dim_(canonical_axis, GetDimsVector(logits));
          const int num_classes =
              size_from_dim_(canonical_axis, GetDimsVector(logits));

          out[0].set_data_type(logits.data_type());
          out[0].add_dims(batch_size);
          out[0].add_dims(num_classes);

          return out;
        })
    .SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator.
The operator first computes the softmax normalized values for each instance in
the batch of the given input, and then the cross-entropy loss on those values.
This operator is numerically more stable than separate Softmax and CrossEntropy
ops.
The inputs are a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions) and a tensor of labels (ground truth).
The outputs are a tensor with the probability for each label for each example
(N x D) and the averaged loss (scalar).
Use the argument label_prob=1 to pass the labels as a probability distribution.
An optional third input blob can be used to weight the samples for the loss.
)DOC")
    .Input(0, "logits", "Unscaled log probabilities")
    .Input(1, "labels", "Ground truth")
    .Input(
        2,
        "weight_tensor",
        "Optional blob to be used to weight the samples for the loss.")
    .Output(0, "softmax", "Tensor with softmax probabilities")
    .Output(1, "loss", "Average loss");
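
// A reader's sketch (not part of the registered schema) of what the forward
// pass below computes for one example with logits x and label t:
//
//   p_j  = exp(x_j - max_k x_k) / sum_k exp(x_k - max_k x_k)   (softmax)
//   loss = -log(p_t)                                           (hard labels)
//   loss = -sum_j t_j * log(p_j)                               (label_prob=1)
//
// The reported loss is the per-sample loss averaged over the batch, weighted
// by the optional weight_tensor, and multiplied by the "scale" argument.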

// Input: X, T, P, dY; Output: dX
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);
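
// Sketch of the backward computation implemented below: with softmax
// probabilities p, (possibly soft) labels t and optional per-sample weight w_i,
//   dX_ij = (p_ij - t_ij) * w_i * scale * d_avg_loss / total_weight,
// where total_weight is the sum of the w_i (or N when no weights are given).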

#define DONT_CARE (-1)

template <>
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  auto* P = Output(0); // Probabilities from softmax
  auto* avg_loss = Output(1); // Average loss

  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis);
  P->ResizeLike(X);

  if (sum_multiplier_.size() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }

  float* Pdata = P->mutable_data<float>();
  const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);

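  // Two label layouts are accepted: hard integer class indices of shape [N]
  // (one index per example) or, with label_prob=1, a probability distribution
  // of shape [N, D] per example. The checks below enforce the expected shape.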
  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }

  if (sum_multiplier_.size() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }

  rowmax_.Resize(N);
  losses_.Resize(N);

  SoftmaxCPU(
      context_,
      N,
      D,
      X.data<float>(),
      Pdata,
      losses_.mutable_data<float>(),
      sum_multiplier_.data<float>(),
      !label_prob_mode_,
      rowmax_.mutable_data<float>());
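
  // Note: with hard labels, the !label_prob_mode_ flag above asks SoftmaxCPU
  // for the logarithmic form, so at this point Pdata holds log-probabilities;
  // the loop below reads the per-sample loss directly as -log(p_t).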

  // Then compute cross entropy
  float loss_sum = 0.0;
  float weight_sum = 0.0;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();
    const float* Xdata = X.data<float>();

    for (int i = 0; i < N; ++i) {
      CAFFE_ENFORCE(
          label_data[i] < D && label_data[i] >= 0,
          "Label seems incorrect: label value not in [0, num_classes): ",
          label_data[i],
          " vs ",
          D);
      float weight = weights ? weights[i] : 1.0;
      float l = -Pdata[i * D + label_data[i]] * weight;
      loss_sum += l;
      weight_sum += weight;
    }
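    // Convert the log-probabilities written by SoftmaxCPU back into
    // probabilities so that the exposed softmax output contains p, not log(p).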
    math::Exp(N * D, Pdata, Pdata, &context_);
  } else {
    const float* label_data = T.data<float>();

    for (int i = 0; i < N; ++i) {
      float l = 0.0;
      float total_prob = 0.0;
      float weight = weights ? weights[i] : 1.0;
      for (int j = 0; j < D; ++j) {
        CAFFE_ENFORCE(
            label_data[i * D + j] >= 0,
            "Label prob seems incorrect: label prob value must be nonnegative:",
            " ",
            label_data[i * D + j]);
        l += -log(std::max(Pdata[i * D + j], 1e-20f)) * label_data[i * D + j] *
            weight;
        total_prob += label_data[i * D + j];
      }
      loss_sum += l;
      CAFFE_ENFORCE(
          std::abs(total_prob - 1.) < 1e-5f,
          "Label prob seems incorrect: label prob values do not sum to 1.0: ",
          total_prob,
          " vs 1.0 (+/- 1e-5)");
      weight_sum += weight;
    }
  }
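
  // weight_sum is the sum of the per-sample weights (or simply N when no
  // weight tensor is given), so the result below is a (weighted) average loss.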

  avg_loss->Resize(vector<TIndex>());
  float* avg_loss_data = avg_loss->mutable_data<float>();
  if (weight_sum != 0.0) {
    avg_loss_data[0] = loss_sum * scale_ / weight_sum;
  } else {
    avg_loss_data[0] = 0.0;
  }
  return true;
}

template <>
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  // Input(2) is the weight tensor, if given
  auto& P = Input(InputSize() - 2); // Probabilities from softmax
  auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
  auto* dX = Output(0);
  const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);

  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis);
  dX->ResizeLike(X);

  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }

  const float* Pdata = P.data<float>();
  float* dX_data = dX->mutable_data<float>();

  // Copy the softmax probabilities into dX. For every unit except the one
  // that corresponds to the correct label, the gradient is simply the softmax
  // probability p_j; the entry for the correct label is adjusted below.
  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);

  // Compute gradient for the matching labels.
  float total_weight = 0.0f;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        float weight = weights[i];
        dX_data[idx] = Pdata[idx] - 1.0;
        for (int d = 0; d < D; d++) {
          int k = i * D + d;
          dX_data[k] *= weight;
        }

        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        dX_data[idx] = Pdata[idx] - 1.0f;
      }
      total_weight = N;
    }
  } else {
    const float* label_data = T.data<float>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        float weight = weights[i];
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = (Pdata[idx] - label_data[idx]) * weight;
        }
        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = Pdata[idx] - label_data[idx];
        }
      }
      total_weight = N;
    }
  }

  // Scale everything by scale_ * d_avg_loss / total_weight.
  if (total_weight > 0) {
    math::Scale<float, CPUContext>(
        dX->size(),
        scale_ / total_weight * d_avg_loss.data<float>()[0],
        dX->data<float>(),
        dX_data,
        &context_);
  }
  return true;
}

namespace {
class GetSoftmaxWithLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> blob_names{
        {I(0), I(1), O(0), GO(1)},
    };

    // Add weight blob, if given
    if (def_.input_size() == 3) {
      blob_names.emplace(blob_names.begin() + 2, I(2));
    }
    return SingleGradientDef(
        "SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
  }
};
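
// The gradient op therefore receives (X, T, [weight_tensor], P, d_avg_loss) as
// inputs and produces dX, which matches the Input(InputSize() - 2) and
// Input(InputSize() - 1) indexing used in the gradient RunOnDevice above.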

REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
} // namespace
} // namespace caffe2