Caffe2 - C++ API
A deep learning, cross platform ML framework
distance_op.cc
#include "caffe2/operators/distance_op.h"

namespace caffe2 {

template <>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto* distance = Output(0);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  distance->Resize(N);
  int D = N > 0 ? X.size() / N : 0;
  float* distance_data = distance->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
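  // Each row's squared L2 distance is assembled from three dot products,
  // using the identity ||x - y||^2 / 2 = (||x||^2 + ||y||^2) / 2 - x.y.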
  for (int i = 0; i < N; ++i) {
    float Xscale, Yscale, cross;
    math::Dot<float, CPUContext>(
        D, X_data + i * D, X_data + i * D, &Xscale, &context_);
    math::Dot<float, CPUContext>(
        D, Y_data + i * D, Y_data + i * D, &Yscale, &context_);
    math::Dot<float, CPUContext>(
        D, X_data + i * D, Y_data + i * D, &cross, &context_);
    distance_data[i] = (Xscale + Yscale) * 0.5 - cross;
  }
  return true;
}

template <>
bool L1DistanceOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto* distance = Output(0);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  distance->Resize(N);
  int D = N > 0 ? X.size() / N : 0;

  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
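  // Each row's L1 distance is the sum of elementwise absolute differences,
  // computed with Eigen vector maps over the row slices.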
  for (int i = 0; i < N; ++i) {
    (distance->mutable_data<float>())[i] =
        (ConstEigenVectorMap<float>(X_data + i * D, D).array() -
         ConstEigenVectorMap<float>(Y_data + i * D, D).array())
            .abs()
            .sum();
  }
  return true;
}

template <>
bool L1DistanceGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dDistance = Input(2);
  auto* dX = Output(0);
  auto* dY = Output(1);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  int D = N > 0 ? X.size() / N : 0;
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dDistance.ndim() == 1);
  CAFFE_ENFORCE(dDistance.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);
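  // The (sub)gradient of |x - y| is sign(x - y): the incoming per-row gradient
  // is passed through with the appropriate sign, and set to zero when the
  // difference is within kEps of zero.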
  for (int i = 0; i < N; ++i) {
    auto offset = i * D;
    for (int j = 0; j < D; ++j) {
      const float temp =
          (X.data<float>())[offset + j] - (Y.data<float>())[offset + j];
      const float kEps = 1e-12f;
      if (temp < -kEps) {
        dX->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
        dY->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
      } else if (temp > kEps) {
        dX->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
        dY->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
      } else {
        dX->mutable_data<float>()[offset + j] = 0;
        dY->mutable_data<float>()[offset + j] = 0;
      }
    }
  }
  return true;
}

template <>
bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(COS_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  const int N = X.ndim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  result->Resize(N);
  float* result_data = result->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
  float X2, Y2;
  const float kEps = 1e-12f;
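  // Each row's similarity is cos(x, y) = x.y / (||x|| * ||y||); kEps keeps the
  // denominator away from zero for all-zero rows.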
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Dot<float, CPUContext>(
        D, X_data + offset, X_data + offset, &X2, &context_);
    math::Dot<float, CPUContext>(
        D, Y_data + offset, Y_data + offset, &Y2, &context_);
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, result_data + i, &context_);
    result_data[i] /= std::sqrt(std::max(X2, kEps) * std::max(Y2, kEps));
  }
  return true;
}

template <>
bool CosineSimilarityGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dCos = Input(DER_COS_IN);
  auto* dX = Output(DER_X_OUT);
  auto* dY = Output(DER_Y_OUT);
  const int N = X.ndim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dCos.ndim() == 1);
  CAFFE_ENFORCE(dCos.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);

  const auto* X_data = X.template data<float>();
  const auto* Y_data = Y.template data<float>();
  const auto* dCos_data = dCos.template data<float>();
  auto* dX_data = dX->template mutable_data<float>();
  auto* dY_data = dY->template mutable_data<float>();
  float XN, YN, XY;
  const float kEps = 1e-12f;
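  // With c = x.y / (||x|| * ||y||), the per-row gradients are
  //   dX = dCos * (y / (||x|| * ||y||) - c * x / ||x||^2)
  //   dY = dCos * (x / (||x|| * ||y||) - c * y / ||y||^2)
  // implemented below as a Scale followed by an Axpy for each input.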
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;

    // TODO: cache these results from the forward pass
    // ||x||
    math::Dot<float, CPUContext>(
        D, X_data + offset, X_data + offset, &XN, &context_);
    XN = std::sqrt(std::max(XN, kEps));
    // ||y||
    math::Dot<float, CPUContext>(
        D, Y_data + offset, Y_data + offset, &YN, &context_);
    YN = std::sqrt(std::max(YN, kEps));
    // ||x|| * ||y||
    float XYN = XN * YN;
    // x^T y
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, &XY, &context_);

    math::Scale<float, CPUContext>(
        D, dCos_data[i] / XYN, Y_data + offset, dX_data + offset, &context_);
    math::Axpy(
        D,
        -dCos_data[i] * XY / (XN * XN * XYN),
        X_data + offset,
        dX_data + offset,
        &context_);

    math::Scale<float, CPUContext>(
        D, dCos_data[i] / XYN, X_data + offset, dY_data + offset, &context_);
    math::Axpy(
        D,
        -dCos_data[i] * XY / (YN * YN * XYN),
        Y_data + offset,
        dY_data + offset,
        &context_);
  }

  return true;
}

template <>
bool DotProductOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(DOT_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i), "dimension at ", i);
  }
  int N, D;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  result->Resize(N);
  float* result_data = result->template mutable_data<float>();
  const float* X_data = X.template data<float>();
  const float* Y_data = Y.template data<float>();
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, result_data + i, &context_);
  }
  return true;
}

OpSchema::Cost CostInferenceForDotProduct(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  struct OpSchema::Cost c = PointwiseCostInference<1>(def, in);
  c.params_bytes = 0;
  return c;
}

template <>
bool DotProductGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dDot = Input(DER_DOT_IN);
  auto* dX = Output(DER_X_OUT);
  auto* dY = Output(DER_Y_OUT);
  int N, D;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dDot.ndim() == 1);
  CAFFE_ENFORCE(dDot.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);

  const auto* X_data = X.template data<float>();
  const auto* Y_data = Y.template data<float>();
  const auto* dDot_data = dDot.template data<float>();
  auto* dX_data = dX->template mutable_data<float>();
  auto* dY_data = dY->template mutable_data<float>();
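  // Since d(x.y)/dx = y and d(x.y)/dy = x, each row's gradient is just the
  // opposite input scaled by the incoming per-row gradient.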
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Scale<float, CPUContext>(
        D, dDot_data[i], X_data + offset, dY_data + offset, &context_);
    math::Scale<float, CPUContext>(
        D, dDot_data[i], Y_data + offset, dX_data + offset, &context_);
  }
  return true;
}

template <>
bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(DOT_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  CAFFE_ENFORCE_EQ(X.dim32(0), Y.dim32(0));

  int N, D, DX, DY, restD;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    DX = X.size() / N;
    DY = Y.size() / N;
  } else {
    N = 0;
    DX = 0;
    DY = 0;
  }

  D = std::min(DX, DY);
  restD = std::max(DX, DY) - D;
  result->Resize(N);
  float* result_data = result->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
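  // Per row, either tile the shorter vector across the longer one (replicate)
  // or dot the overlapping prefix and then add pad_value times the sum of the
  // leftover tail of the longer vector (pad).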
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offsetX = i * DX, offsetY = i * DY;
    if (replicate_) {
      // L_ for longer vector and S_ for shorter vector
      const float *L_data, *S_data;
      int DL, DS;
      if (DX > DY) {
        L_data = X_data + offsetX;
        S_data = Y_data + offsetY;
        DL = DX;
        DS = DY;
      } else {
        L_data = Y_data + offsetY;
        S_data = X_data + offsetX;
        DL = DY;
        DS = DX;
      }
      float sum = 0.0;
      float tmp = 0.0;
      for (int j = 0; j < DL / DS; j++) {
        math::Dot<float, CPUContext>(
            DS, L_data + j * DS, S_data, &tmp, &context_);
        sum += tmp;
      }
      *(result_data + i) = sum;
    } else {
      math::Dot<float, CPUContext>(
          D, X_data + offsetX, Y_data + offsetY, result_data + i, &context_);
    }

    if (!replicate_ && DX != DY) {
      const float* rest_data;
      float rest_sum = 0;
      if (DX > DY) {
        rest_data = X_data + offsetX + D;
      } else {
        rest_data = Y_data + offsetY + D;
      }
      math::Sum<float, CPUContext>(restD, rest_data, &rest_sum, &context_);
      result_data[i] += rest_sum * pad_value_;
    }
  }
  return true;
}

// L2
REGISTER_CPU_OPERATOR(SquaredL2Distance,
                      SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
                      SquaredL2DistanceGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(SquaredL2Distance)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the squared L2 distance between X and Y, computed per row as
||X - Y||^2 / 2.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(SquaredL2DistanceGradient).NumInputs(3).NumOutputs(2);

class GetSquaredL2DistanceGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SquaredL2DistanceGradient", "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(SquaredL2Distance, GetSquaredL2DistanceGradient);
// L1
REGISTER_CPU_OPERATOR(L1Distance, L1DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    L1DistanceGradient,
    L1DistanceGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(L1Distance)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the L1 distance between X and Y, computed per row as
L1(x, y) = sum_j |x_j - y_j|.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(L1DistanceGradient).NumInputs(3).NumOutputs(2);

class GetL1DistanceGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "L1DistanceGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};

REGISTER_GRADIENT(L1Distance, GetL1DistanceGradient);
// Dot Product
REGISTER_CPU_OPERATOR(DotProduct, DotProductOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    DotProductGradient,
    DotProductGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(DotProduct)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the row-wise dot product between X and Y.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor")
    .CostInferenceFunction(
        OpSchema::CostInferenceFunctionType(CostInferenceForDotProduct));

OPERATOR_SCHEMA(DotProductGradient).NumInputs(3).NumOutputs(2);

class GetDotProductGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "DotProductGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(DotProduct, GetDotProductGradient);
// Cosine Similarity
REGISTER_CPU_OPERATOR(CosineSimilarity, CosineSimilarityOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    CosineSimilarityGradient,
    CosineSimilarityGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(CosineSimilarity)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the row-wise cosine similarity between X and Y.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(CosineSimilarityGradient).NumInputs(3).NumOutputs(2);

class GetCosineSimilarityGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CosineSimilarityGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(CosineSimilarity, GetCosineSimilarityGradient);
// Dot Product allows padding
REGISTER_CPU_OPERATOR(
    DotProductWithPadding,
    DotProductWithPaddingOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    DotProductWithPaddingGradient,
    DotProductWithPaddingGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(DotProductWithPadding)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given two input float tensors X and Y with possibly different shapes, this op
produces one output float tensor of the dot product between X and Y. Two
strategies are supported to reconcile the shapes before the usual dot product
is taken: 1) pad the smaller tensor (using pad_value) to the same shape as the
other one, or 2) replicate the smaller tensor until it matches the shape of
the other one. Note that the first dimension of X and Y must be equal; only
the second dimension of X or Y can be padded.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor")
    .Output(0, "Z", "1D output tensor")
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .Arg("pad_value", "the padding value for tensors with smaller dimension")
    .Arg("replicate", "whether to replicate the smaller tensor or not");

OPERATOR_SCHEMA(DotProductWithPaddingGradient).NumInputs(3).NumOutputs(2);

class GetDotProductWithPaddingGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    float pad_value = 0;
    bool replicate = false;
    if (ArgumentHelper::HasArgument(Def(), "pad_value")) {
      pad_value = GetArgument(Def(), "pad_value").f();
    }
    if (ArgumentHelper::HasArgument(Def(), "replicate")) {
      replicate = GetArgument(Def(), "replicate").i();
    }

    const auto dot_arg =
        vector<Argument>{MakeArgument<float>("pad_value", pad_value),
                         MakeArgument<bool>("replicate", replicate)};

    return SingleGradientDef(
        "DotProductWithPaddingGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)},
        dot_arg);
  }
};
REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
} // namespace caffe2
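
As a usage note, the sketch below shows one way these CPU operators can be driven directly from C++. It is a minimal, illustrative sketch rather than canonical usage: it assumes the Caffe2 core headers (caffe2/core/operator.h, caffe2/core/workspace.h) and the TensorCPU-era API used in the listing above (Resize, mutable_data<float>()); the blob names, shapes, and values are made up for illustration.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

// Minimal sketch: run the DotProduct CPU operator on two 2x3 inputs.
void RunDotProductExample() {
  caffe2::Workspace ws;

  // Fill the illustrative blobs "X" and "Y" with 2 rows of 3 elements each.
  for (const char* name : {"X", "Y"}) {
    auto* tensor = ws.CreateBlob(name)->GetMutable<caffe2::TensorCPU>();
    tensor->Resize(2, 3);
    float* data = tensor->mutable_data<float>();
    for (int i = 0; i < tensor->size(); ++i) {
      data[i] = static_cast<float>(i + 1);
    }
  }

  // Describe the op: two inputs, one output, default (CPU) device.
  caffe2::OperatorDef def;
  def.set_type("DotProduct");
  def.add_input("X");
  def.add_input("Y");
  def.add_output("Z");

  // Instantiate and run; "Z" becomes a 1D tensor of length 2 holding the
  // per-row dot products.
  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();
}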