#include "caffe2/operators/h_softmax_op.h"

namespace caffe2 {

template <>
float HSoftmaxOp<float, CPUContext>::RunForwardSingle(
    const float* X,
    const float* W,
    const float* b,
    int target,
    float* int_output,
    const float* bias_multiplier,
    int dim_out,
    int dim_in,
    int& int_output_offset) {
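  // One step along a sample's path: run this node's fully connected layer
  // (dim_in -> dim_out), softmax the result, stash both intermediate vectors
  // in int_output for the backward pass, and return the cross-entropy loss
  // of the target child.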
  // FC: fc_output = X * W^T + b, written into the intermediate output buffer.
  float* fc_output_data = int_output + int_output_offset;

  math::Gemm<float, CPUContext>(CblasNoTrans, CblasTrans, 1, dim_out, dim_in,
      1, X, W, 0, fc_output_data, &context_);
  math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, 1, b,
      bias_multiplier, 1, fc_output_data, &context_);

  int_output_offset += dim_out;
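  // Numerically stable softmax: softmax(x)_j = exp(x_j - max(x)) / sum_k
  // exp(x_k - max(x)). Subtracting the row max before exponentiation avoids
  // overflow and leaves the result unchanged, since the common factor
  // exp(-max(x)) cancels in the normalization.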
  float* softmax_output_data = int_output + int_output_offset;

  if (scale_.size() != 1) {
    scale_.Resize(1);
  }
  if (sum_multiplier_.size() != dim_out) {
    sum_multiplier_.Resize(dim_out);
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_.mutable_data<float>(), &context_);
  }
  math::RowwiseMax<float, CPUContext>(1, dim_out, fc_output_data,
      scale_.mutable_data<float>(), &context_);
  // Put the intermediate result X - max(X) into the softmax output.
  context_.template Copy<float, CPUContext, CPUContext>(dim_out,
      fc_output_data, softmax_output_data);
  // Subtract the max (broadcast via the all-ones sum_multiplier_).
  math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, -1,
      sum_multiplier_.data<float>(), scale_.data<float>(), 1,
      softmax_output_data, &context_);
  // Exponentiate.
  math::Exp<float, CPUContext>(dim_out, softmax_output_data,
      softmax_output_data, &context_);
  // Sum of the exponentials, stored in scale_.
  math::Gemv<float, CPUContext>(CblasNoTrans, 1, dim_out, 1,
      softmax_output_data, sum_multiplier_.data<float>(), 0,
      scale_.mutable_data<float>(), &context_);
  // Normalize.
  const float scale = *scale_.data<float>();
  for (int j = 0; j < dim_out; ++j) {
    softmax_output_data[j] /= scale;
  }

  int_output_offset += dim_out;
  if (target < 0) {
    // Used by HSoftmaxSearch: no target, only the softmax outputs are needed.
    return -1;
  }
  // Cross-entropy loss of the target child.
  return -log(std::max(softmax_output_data[target], kLOG_THRESHOLD()));
}
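// Illustrative example (not from the original source): for a two-level
// hierarchy root -> {n1, n2} and n1 -> {w0, w1}, the model factorizes
//   p(w1 | x) = p(n1 | root, x) * p(w1 | n1, x),
// so -log p(w1 | x) = -log p(n1 | root, x) - log p(w1 | n1, x), which is
// exactly the per-node loss that RunOnDevice below accumulates into Y.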
template <>
bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);
  auto& label = Input(3);
  auto* Y = Output(0);
  auto* intermediate_output = Output(1);
  // Batch size.
  int M = X.ndim() > 1 ? X.dim32(0) : 1;
  // Input feature dimension.
  int K = X.size() / M;
  CAFFE_ENFORCE_GE(W.ndim(), 2); // N * K
  CAFFE_ENFORCE_EQ(b.ndim(), 1); // N
  CAFFE_ENFORCE_EQ(K, W.size() / (W.dim32(0)));
  // Sum of the output dimensions of all hierarchy nodes.
  int N = W.dim32(0);
  CAFFE_ENFORCE_EQ(N, b.dim32(0));
  Y->Resize(M);
  auto* Ydata = Y->mutable_data<float>();
  math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
  const auto* labeldata = label.data<int>();
  auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
  int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
  intermediate_output->Resize(int_output_size);
  float* int_output_data = intermediate_output->mutable_data<float>();
  int int_output_offset = 0;
  if (bias_multiplier_.size() != M) {
    bias_multiplier_.Resize(M);
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_.mutable_data<float>(), &context_);
  }
  for (int sample = 0; sample < M; ++sample) {
    int word_id = labeldata[sample];
    const PathProto& path = hierarchy[word_id];
    for (const PathNodeProto& node : path.path_nodes()) {
      // Offset of this node's weight matrix in W.
      int w_offset = node.index();
      // Number of output dimensions of this node's weight matrix.
      int w_length = node.length();
      int target = node.target();
      // Accumulate log probabilities along the path.
      Ydata[sample] += RunForwardSingle(X.data<float>() + sample * K,
          W.data<float>() + w_offset * K, b.data<float>() + w_offset, target,
          int_output_data, bias_multiplier_.data<float>() + sample, w_length,
          K, int_output_offset);
    }
  }
  return true;
}
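// Backward pass for one node of one sample's path. The forward step was
// cross-entropy(softmax(FC(x))), so the gradient is chained in reverse:
// cross-entropy -> softmax -> FC, reading the activations saved in
// int_output and writing gradients into dint_output, dW, db, and dX.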
template <>
void HSoftmaxGradientOp<float, CPUContext>::RunBackwardSingle(
    const float* X,
    const float* dY,
    const float* W,
    int target,
    const float* int_output,
    float* dX,
    float* dW,
    float* db,
    float* dint_output,
    int dim_in,
    int dim_out,
    int& int_output_offset) {
  // Cross-entropy gradient. X_entropy is the input of the cross-entropy
  // "layer" (i.e. the softmax output saved in the forward pass), and
  // dX_entropy is the gradient w.r.t. it.
  float* dX_entropy = dint_output + int_output_offset - dim_out;
  const float* X_entropy = int_output + int_output_offset - dim_out;

  math::Set<float, CPUContext>(dim_out, 0.f, dX_entropy, &context_);
  dX_entropy[target] = -(*dY) / std::max(X_entropy[target], kLOG_THRESHOLD());

  int_output_offset -= dim_out;
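  // Softmax gradient: with y = softmax(z) and upstream gradient dy,
  //   dz_j = y_j * (dy_j - sum_k dy_k * y_k).
  // The Dot/Gemv/Mul sequence below computes exactly this, using scale_ to
  // hold the scalar sum_k dy_k * y_k.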
  if (scale_.size() != 1) {
    scale_.Resize(1);
  }
  float* scaledata = scale_.mutable_data<float>();

  if (sum_multiplier_.size() != dim_out) {
    sum_multiplier_.Resize(dim_out);
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_.mutable_data<float>(), &context_);
  }
  float* dX_softmax = dint_output + int_output_offset - dim_out;
  context_.Copy<float, CPUContext, CPUContext>(dim_out, dX_entropy, dX_softmax);

  math::Dot<float, CPUContext>(dim_out, X_entropy, dX_entropy, scaledata,
      &context_);
  math::Gemv<float, CPUContext>(CblasTrans, 1, dim_out, -1,
      sum_multiplier_.data<float>(), scaledata, 1, dX_softmax, &context_);
  math::Mul<float, CPUContext>(dim_out, dX_softmax, X_entropy, dX_softmax,
      &context_);

  int_output_offset -= dim_out;
  // FC gradients.
  if (bias_multiplier_.size() != 1) {
    // The helper bias multiplier has not been created yet; reshape it and
    // fill it with 1.
    bias_multiplier_.Resize(1);
    math::Set<float, CPUContext>(1, static_cast<float>(1),
        bias_multiplier_.template mutable_data<float>(), &context_);
  }
  // dW += dX_softmax^T * X
  math::Gemm<float, CPUContext>(CblasTrans, CblasNoTrans, dim_out, dim_in, 1,
      1, dX_softmax, X, 1, dW, &context_);
  // db += dX_softmax * bias_multiplier_
  math::Gemv<float, CPUContext>(CblasTrans, 1, dim_out, 1, dX_softmax,
      bias_multiplier_.template data<float>(), 1, db, &context_);
  // dX += W^T * dX_softmax
  math::Gemv<float, CPUContext>(CblasTrans, dim_out, dim_in, 1, W, dX_softmax,
      1, dX, &context_);
}
template <>
bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);
  auto& label = Input(3);
  auto& intermediate_output = Input(4);
  auto& dY = Input(5);
  auto* dX = Output(0);
  auto* dW = Output(1);
  auto* db = Output(2);
  auto* dX_intermediate_output = Output(3);
  dX->ResizeLike(X);
  dW->ResizeLike(W);
  db->ResizeLike(b);
  dX_intermediate_output->ResizeLike(intermediate_output);
  float* dX_data = dX->mutable_data<float>();
  float* dW_data = dW->mutable_data<float>();
  float* db_data = db->mutable_data<float>();
  float* dOutput_data = dX_intermediate_output->mutable_data<float>();
  math::Set<float, CPUContext>(X.size(), 0.f, dX_data, &context_);
  math::Set<float, CPUContext>(W.size(), 0.f, dW_data, &context_);
  math::Set<float, CPUContext>(b.size(), 0.f, db_data, &context_);
  math::Set<float, CPUContext>(intermediate_output.size(), 0.f, dOutput_data,
      &context_);
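  // Zero-initialization matters: RunBackwardSingle accumulates with beta = 1,
  // since hierarchy nodes are shared across samples and a node's dW/db slices
  // collect contributions from every sample whose path visits that node.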
  int M = X.ndim() > 1 ? X.dim32(0) : 1;
  int K = X.size() / M;
  const auto* labeldata = label.data<int>();
  auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
  int output_offset = getIntermediateOutputSize(labeldata, M, hierarchy);

  // Traverse backward so the blocks of intermediate_output written by
  // HSoftmaxOp are consumed sequentially in reverse order.
  for (int sample = M - 1; sample >= 0; sample--) {
    int word_id = labeldata[sample];
    PathProto path = hierarchy[word_id];
    for (auto node = path.path_nodes().rbegin();
         node != path.path_nodes().rend(); node++) {
      int w_offset = node->index();
      int w_length = node->length();
      int target = node->target();
      RunBackwardSingle(X.data<float>() + sample * K,
          dY.data<float>() + sample, W.data<float>() + w_offset * K, target,
          intermediate_output.data<float>(), dX_data + sample * K,
          dW_data + w_offset * K, db_data + w_offset, dOutput_data, K,
          w_length, output_offset);
    }
  }
  return true;
}
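// Beam search over the tree. Scores are cumulative negative log probabilities
// (smaller is better). Illustrative example (numbers are made up): with
// parent_score = 2.0 and beam = 1.5, a child is expanded only if its
// cumulative score stays below 3.5; surviving children are recursed into, and
// everything else is pruned.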
template <>
bool HSoftmaxSearchOp<float, CPUContext>::pruning(
    const float* X, int sample, int K, const float* W, const float* b,
    const NodeProto& src_node, NodeProto& dst_node, float parent_score,
    float beam) {
  int w_length = src_node.children_size() + src_node.word_ids_size();
  Tensor<CPUContext> intermediate_data;
  intermediate_data.Resize(2 * w_length);
  float* int_output_data = intermediate_data.template mutable_data<float>();
  int int_output_offset = 0;
  int w_offset = src_node.offset();

  // Target -1: only the softmax outputs are needed, no loss.
  RunForwardSingle(X + K * sample, W + w_offset * K, b + w_offset, -1,
      int_output_data, bias_multiplier_.template data<float>() + sample,
      w_length, K, int_output_offset);
  float* softmax_output_data = int_output_data + w_length;
  // Convert probabilities into cumulative path scores (negative log).
  for (int i = 0; i < w_length; i++) {
    softmax_output_data[i] =
        -log(std::max(softmax_output_data[i], kLOG_THRESHOLD())) +
        parent_score;
  }
  for (int i = 0; i < src_node.children_size(); i++) {
    if (softmax_output_data[i] < parent_score + beam) {
      dst_node.add_children();
      int idx = dst_node.children_size() - 1;
      CAFFE_ENFORCE(
          src_node.children(i).has_offset(),
          "HSM Search requires the field offset in NodeProto");
      dst_node.mutable_children(idx)->set_offset(src_node.children(i).offset());
      CAFFE_ENFORCE(
          src_node.children(i).has_name(),
          "HSM Search requires the field name in NodeProto");
      dst_node.mutable_children(idx)->set_name(src_node.children(i).name());
      dst_node.add_scores(softmax_output_data[i]);
      pruning(X, sample, K, W, b, src_node.children(i),
          *dst_node.mutable_children(idx), softmax_output_data[i], beam);
    }
  }
  // Leaf word ids attached to this node.
  for (int i = src_node.children_size(); i < w_length; i++) {
    if (softmax_output_data[i] < parent_score + beam) {
      dst_node.add_word_ids(src_node.word_ids(i - src_node.children_size()));
      dst_node.add_scores(softmax_output_data[i]);
    }
  }
  return true;
}
template <>
bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
    const NodeProto& node,
    std::vector<std::pair<string, float>>& info) {
  int i = 0;

  for (const auto& n : node.children()) {
    info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
  }
  for (const int n : node.word_ids()) {
    info.emplace_back(std::make_pair(caffe2::to_string(n), node.scores(i++)));
  }

  for (const auto& n : node.children()) {
    extractNodes(n, info);
  }
  return true;
}
template <>
bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);
  auto* Y_names = Output(0);
  auto* Y_scores = Output(1);
  // Batch size.
  int M = X.ndim() > 1 ? X.dim32(0) : 1;
  // Input feature dimension.
  int K = X.size() / M;
  CAFFE_ENFORCE(W.ndim() == 2, "Weight must be a matrix."); // N * K
  CAFFE_ENFORCE(b.ndim() == 1, "Bias must be a vector."); // N
  CAFFE_ENFORCE(K == W.size() / (W.dim32(0)), "feature dimension mismatch.");
  // Sum of the output dimensions of all hierarchy nodes.
  int N = W.dim32(0);
  CAFFE_ENFORCE(N == b.dim32(0), "mismatch between Weight and Bias.");
  Y_names->Resize(M, top_n_);
  Y_scores->Resize(M, top_n_);
  if (bias_multiplier_.size() != M) {
    bias_multiplier_.Resize(M);
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_.mutable_data<float>(), &context_);
  }
  for (int sample = 0; sample < M; ++sample) {
    CAFFE_ENFORCE(
        tree_.root_node().has_offset(),
        "HSM Search requires the field offset in NodeProto");
    CAFFE_ENFORCE(
        tree_.root_node().has_name(),
        "HSM Search requires the field name in NodeProto");

    NodeProto dst_node;
    dst_node.set_offset(tree_.root_node().offset());
    dst_node.set_name(tree_.root_node().name());

    // Beam-prune the tree starting from the root with score 0.
    pruning(X.data<float>(), sample, K, W.data<float>(), b.data<float>(),
        tree_.root_node(), dst_node, 0, beam_);
    std::vector<std::pair<string, float>> info;
    extractNodes(dst_node, info);
    // Sort the surviving candidates by ascending score (scores are negative
    // log probabilities, so smaller is better). Clamp the sort boundary so an
    // empty candidate list is handled safely.
    std::partial_sort(
        info.begin(),
        info.begin() + std::min(static_cast<size_t>(top_n_), info.size()),
        info.end(),
        [&](std::pair<string, float> a, std::pair<string, float> b) {
          return a.second < b.second;
        });
    auto* y_name_data = Y_names->mutable_data<string>() + sample * top_n_;
    auto* y_score_data = Y_scores->mutable_data<float>() + sample * top_n_;
    for (int i = 0; i < top_n_; i++) {
      if (i < static_cast<int>(info.size())) {
        y_name_data[i] = info[i].first;
        y_score_data[i] = info[i].second;
      } else {
        // Fewer than topN candidates survived the pruning; pad the rest.
        y_name_data[i] = "";
        y_score_data[i] = 0.f;
      }
    }
  }
  return true;
}
template <typename T, class Context>
bool HuffmanTreeHierarchyOp<T, Context>::RunOnDevice() {
  const auto& Y = Input(0);
  auto* treeOutput = Output(0);
  CAFFE_ENFORCE_EQ(Y.ndim(), 1, "Input labels must be a vector.");
  const auto* y_data = Y.template data<T>();
  treeOutput->Resize(1);
  std::vector<int> labelCounts;
  labelCounts.resize(num_classes_, 0);
  for (int i = 0; i < Y.dim32(0); ++i) {
    // Count each label's frequency.
    const int label_index = y_data[i];
    CAFFE_ENFORCE_LT(
        label_index,
        num_classes_,
        "Found an input label ",
        label_index,
        " not in the range [0, num_classes).");
    labelCounts[label_index]++;
  }
  std::priority_queue<Node, std::vector<Node>, NodeComparator> nodes;
  std::vector<Node> huffmanTree;
  std::vector<int> labelIndices;
  labelIndices.resize(num_classes_);

  for (int i = 0; i < num_classes_; ++i) {
    Node node(i, labelCounts[i]);
    nodes.push(node);
  }
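  // Classic greedy Huffman construction: repeatedly pop the two lowest-count
  // nodes from the priority queue and push their merged parent back, until a
  // single root remains.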
  // Pop the lowest-count node and append it to the tree array.
  auto get_next_node = [&nodes, &huffmanTree, &labelIndices]() {
    auto node = nodes.top();
    int node_index = huffmanTree.size();
    if (node.label != -1) {
      labelIndices[node.label] = node_index;
    }
    nodes.pop();
    huffmanTree.push_back(node);
    return std::pair<int, Node>(node_index, node);
  };
  // Merge two nodes and push the parent back into the queue.
  auto merge_nodes = [&nodes](
      const std::pair<int, Node>& node_l, const std::pair<int, Node>& node_r) {
    Node node(-1, node_l.second.count + node_r.second.count);
    node.left_ch_index = node_l.first;
    node.right_ch_index = node_r.first;
    nodes.push(node);
  };
  // Bottom-up Huffman tree construction.
  while (!nodes.empty()) {
    auto lNode = get_next_node();
    if (!nodes.empty()) {
      auto rNode = get_next_node();
      merge_nodes(lNode, rNode);
    }
  }
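  // Illustrative example (made-up counts): for labels {a:1, b:2, c:4} the
  // first merge combines a and b into an internal node of count 3, and the
  // second merges that node with c. Frequent labels therefore sit closer to
  // the root, giving them shorter paths and cheaper hierarchical softmax
  // steps.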
  auto is_leaf_node = [&huffmanTree](const int node_index) {
    return huffmanTree[node_index].left_ch_index == -1 &&
        huffmanTree[node_index].right_ch_index == -1;
  };

  auto get_node_label = [&huffmanTree](const int node_index) {
    return huffmanTree[node_index].label;
  };
  // Recursively convert the array representation into a NodeProto tree.
  int current_offset = 0;
  std::function<void(int, NodeProto*)> build_tree = [&](
      const int node_index, NodeProto* node) {
    if (is_leaf_node(node_index) || node_index == -1) {
      return;
    }
    const int left_ch_index = huffmanTree[node_index].left_ch_index;
    const int right_ch_index = huffmanTree[node_index].right_ch_index;
    if (left_ch_index != -1) {
      if (is_leaf_node(left_ch_index)) {
        node->add_word_ids(get_node_label(left_ch_index));
      } else {
        auto* ch_node = node->add_children();
        ch_node->set_offset(current_offset);
        current_offset += 2;
        build_tree(left_ch_index, ch_node);
      }
    }
    if (right_ch_index != -1) {
      if (is_leaf_node(right_ch_index)) {
        node->add_word_ids(get_node_label(right_ch_index));
      } else {
        auto* ch_node = node->add_children();
        ch_node->set_offset(current_offset);
        current_offset += 2;
        build_tree(right_ch_index, ch_node);
      }
    }
  };
  // The last element inserted into the tree array is the root.
  const int rootNodeIndex = huffmanTree.size() - 1;
  NodeProto rootNode;
  rootNode.set_offset(current_offset);
  current_offset += 2;
  build_tree(rootNodeIndex, &rootNode);

  TreeProto treeProto;
  *treeProto.mutable_root_node() = rootNode;

  treeProto.SerializeToString(treeOutput->template mutable_data<string>());
  return true;
}
REGISTER_CPU_OPERATOR(HSoftmax, HSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxGradient, HSoftmaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxSearch, HSoftmaxSearchOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    HuffmanTreeHierarchy,
    HuffmanTreeHierarchyOp<int64_t, CPUContext>);
OPERATOR_SCHEMA(HSoftmax)
    .NumInputs(4)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Hierarchical softmax is an operator which approximates the softmax operator
while giving significant training speed gains and reasonably comparable
performance. In this operator, instead of calculating the probabilities of all
the classes, we calculate the probability of each step in the path from root to
the target word in the hierarchy.

The operator takes a 2-D tensor (Tensor<float>) containing a batch of input
vectors, a set of parameters represented by the weight matrix and bias terms,
and a 1-D tensor (Tensor<int>) holding labels, i.e. the indices of the target
class. The hierarchy has to be specified as an argument to the operator.

The operator returns a 1-D tensor holding the computed log probability of the
target class and a 2-D tensor of intermediate outputs (from the weight matrix
and the softmax at each step on the path from root to target class), which is
used by the gradient operator to compute gradients for all samples in the
batch.
)DOC")
    .Arg(
        "hierarchy",
        "Serialized HierarchyProto string containing the list of "
        "vocabulary words and their paths from the root of the hierarchy to "
        "the leaf")
    .Input(0, "X", "Input data from the previous layer")
    .Input(
        1,
        "W",
        "2D blob containing 'stacked' fully connected weight matrices. Each "
        "node in the hierarchy contributes one FC weight matrix if it has "
        "children nodes. Dimension is N*D, where D is the input dimension of "
        "the data (X) and N is the sum of all output dimensions, i.e. the "
        "total number of nodes (excluding the root)")
    .Input(2, "b", "1D blob with N parameters")
    .Input(3, "labels", "int word_id of the target word")
    .Output(0, "Y", "1-D tensor of log probability outputs, one per sample")
    .Output(
        1,
        "intermediate_output",
        "Extra blob to store the intermediate FC and softmax outputs for each "
        "node in the hierarchical path of a word. The outputs from samples "
        "are stored in consecutive blocks in the forward pass and are used in "
        "reverse order in the backward gradient pass");
OPERATOR_SCHEMA(HSoftmaxGradient).NumInputs(6).NumOutputs(4);
class GetHSoftmaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "HSoftmaxGradient",
        "",
        vector<string>{I(0), I(1), I(2), I(3), O(1), GO(0)},
        vector<string>{GI(0), GI(1), GI(2), GO(1)});
  }
};
REGISTER_GRADIENT(HSoftmax, GetHSoftmaxGradient);
OPERATOR_SCHEMA(HSoftmaxSearch)
    .NumInputs(3)
    .NumOutputs(2)
    .SetDoc(R"DOC(
HSoftmaxSearch is an operator to generate the most probable paths given a
well-trained model and an input vector. A greedy algorithm is used to prune
the tree.
)DOC")
    .Arg(
        "tree",
        "Serialized TreeProto string containing a tree "
        "including all intermediate nodes and leaves. All nodes must have "
        "names for correct outputs")
    .Arg(
        "beam",
        "Beam used for pruning the tree. Only children whose score is "
        "smaller than the parent's score plus the beam are propagated.")
    .Arg("topN", "Number of nodes in outputs")
    .Input(0, "X", "Input data from the previous layer")
    .Input(1, "W", "The matrix trained from Softmax Ops")
    .Input(2, "b", "The bias trained from Softmax Ops")
    .Output(
        0,
        "Y_names",
        "The names of the selected nodes and leaves. "
        "For nodes, it will be the name defined in the tree. "
        "For leaves, it will be the index of the word in the tree.")
    .Output(1, "Y_scores", "The corresponding scores of Y_names");
SHOULD_NOT_DO_GRADIENT(HSoftmaxSearch);
OPERATOR_SCHEMA(HuffmanTreeHierarchy)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
HuffmanTreeHierarchy is an operator to generate a Huffman tree hierarchy given
the input labels. It returns the tree as a serialized TreeProto.
)DOC")
    .Arg("num_classes", "The number of classes used to build the hierarchy.")
    .Input(0, "Labels", "The labels vector")
    .Output(0, "Hierarchy", "Huffman coding hierarchy of the labels");

SHOULD_NOT_DO_GRADIENT(HuffmanTreeHierarchyOp);
} // namespace caffe2