Caffe2 - C++ API
A deep learning, cross platform ML framework
blob_serialization.h
1 #ifndef CAFFE2_CORE_BLOB_SERIALIZATION_H_
2 #define CAFFE2_CORE_BLOB_SERIALIZATION_H_
3 
4 #include <limits>
5 #include <future>
6 
7 #include <google/protobuf/repeated_field.h>
8 
9 #include "caffe2/core/blob.h"
10 #include "caffe2/core/blob_serializer_base.h"
11 #include "caffe2/core/tensor.h"
12 #include "caffe2/core/typeid.h"
13 #include "caffe2/core/types.h"
14 #include "caffe2/utils/simple_queue.h"
15 
16 CAFFE2_DECLARE_int(caffe2_tensor_chunk_size);
17 CAFFE2_DECLARE_int(caffe2_max_tensor_serializer_threads);
18 CAFFE2_DECLARE_bool(caffe2_serialize_fp16_as_bytes);
19 
20 namespace caffe2 {
21 
22 constexpr auto kTensorBlobType = "Tensor";
23 // String used to separate chunk id from the blob name when storing in DB
24 constexpr auto kChunkIdSeparator = "#%";
25 
26 // The Blob serialization registry and serializer creator functions.
27 CAFFE_DECLARE_TYPED_REGISTRY(
28  BlobSerializerRegistry,
29  CaffeTypeId,
30  BlobSerializerBase,
31  std::unique_ptr);
32 #define REGISTER_BLOB_SERIALIZER(id, ...) \
33  CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__)
34 // Creates a serializer for the given registered type id.
35 inline unique_ptr<BlobSerializerBase> CreateSerializer(CaffeTypeId id) {
36  return BlobSerializerRegistry()->Create(id);
37 }
38 
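For orientation, here is a minimal sketch of how this registry is typically driven, assuming a CPU tensor blob and an illustrative blob name; the acceptor lambda follows the BlobSerializerBase::SerializationAcceptor signature and error handling is omitted:

Blob blob;
auto* tensor = blob.GetMutable<Tensor<CPUContext>>();
tensor->Resize(4);
tensor->mutable_data<float>();  // allocate storage so there is data to serialize

// Look up the serializer registered for the blob's stored type.
auto serializer = CreateSerializer(blob.meta().id());
serializer->Serialize(
    blob,
    "my_blob",  // illustrative name
    [](const std::string& blobName, const std::string& data) {
      // Each serialized chunk arrives here, e.g. to be written to a DB.
    });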
39 /**
40  * @brief TensorSerializer is the serializer for Tensors.
41  *
42  * TensorSerializer takes a blob that contains a Tensor, and serializes it
43  * into a TensorProto protocol buffer.
44  */
45 template <class Context>
46 class TensorSerializer : public BlobSerializerBase {
47  public:
48  TensorSerializer() : context_() {}
49  ~TensorSerializer() override {}
50  /**
51  * Serializes a Blob. The blob is assumed to contain a Tensor<Context>;
52  * this is enforced at serialization time.
53  */
54  void Serialize(
55  const Blob& blob,
56  const string& name,
57  SerializationAcceptor acceptor) override;
58  void SerializeWithChunkSize(
59  const Blob& blob,
60  const string& name,
61  SerializationAcceptor acceptor,
62  int chunk_size) override;
63 
64  void Serialize(const Tensor<Context>& tensor, const string& name,
65  TensorProto* proto, size_t chunkBegin, int32_t chunkSize);
66 
67  private:
68  // A utility function to store the device context details.
69  void StoreDeviceDetail(const Tensor<Context>& input, TensorProto* proto);
70  Context context_;
71 };
72 
73 /**
74  * @brief BlobDeserializerBase is an abstract class that deserializes a blob
75  * from a BlobProto or a TensorProto.
76  */
77 class BlobDeserializerBase {
78  public:
79  virtual ~BlobDeserializerBase() {}
80 
81  // Deserializes from a BlobProto object.
82  virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0;
83 };
84 
85 CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase);
86 #define REGISTER_BLOB_DESERIALIZER(name, ...) \
87  CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__)
88 // Creates a deserializer for the given serialized blob type name.
89 inline unique_ptr<BlobDeserializerBase> CreateDeserializer(const string& type) {
90  return BlobDeserializerRegistry()->Create(type);
91 }
92 
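Deserialization mirrors this: BlobProto::type (kTensorBlobType for tensors) selects the deserializer from the registry. A hedged sketch, where `serialized` stands in for one chunk produced by an acceptor as shown earlier:

BlobProto blob_proto;
CAFFE_ENFORCE(blob_proto.ParseFromString(serialized));  // `serialized` is assumed input
Blob blob;
auto deserializer = CreateDeserializer(blob_proto.type());
deserializer->Deserialize(blob_proto, &blob);
// Blob::Deserialize(serialized) performs the same lookup internally.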
93 /**
94  * @brief TensorDeserializer is the deserializer for Tensors.
95  *
96  * The device that the deserialized Tensor will live under is determined by
97  * the device_detail field of the TensorProto. To deserialize onto a
98  * specific device, adjust the proto's device_detail fields before calling
99  * Deserialize.
100  */
101 template <class Context>
102 class TensorDeserializer : public BlobDeserializerBase {
103  public:
104  void Deserialize(const BlobProto& proto, Blob* blob) override;
105  void Deserialize(const TensorProto& proto, Tensor<Context>* tensor);
106 };
107 
108 ////////////////////////////////////////////////////////////////////////////////
109 // Implementations
110 ////////////////////////////////////////////////////////////////////////////////
111 
112 namespace detail {
113 template <typename SrcType, typename DstType, class Context>
114 inline void CopyToProtoAsIs(
115  const size_t size,
116  const SrcType* src,
117  google::protobuf::RepeatedField<DstType>* field,
118  Context* context) {
119  static_assert(
120  sizeof(SrcType) == sizeof(DstType),
121  "The source type and dest type cannot be copied as-is. Did "
122  "you mean CopyToProtoWithCast?");
123  field->Reserve(size);
124  for (int i = 0; i < size; ++i) {
125  field->Add(0);
126  }
127  context->template Copy<SrcType, Context, CPUContext>(
128  size, src, reinterpret_cast<SrcType*>(field->mutable_data()));
129  // Make sure that we finish the copy into the protobuf.
130  context->FinishDeviceComputation();
131 }
132 
133 template <typename SrcType, typename DstType, class Context>
134 inline void CopyToProtoWithCast(
135  const size_t size,
136  const SrcType* src,
137  google::protobuf::RepeatedField<DstType>* field,
138  Context* context) {
139  // TODO: there is one unnecessary copy here if the context is already
140  // CPUContext. Remove it if it becomes performance critical.
141  unique_ptr<SrcType[]> buffer(new SrcType[size]);
142  context->template Copy<SrcType, Context, CPUContext>(
143  size, src, buffer.get());
144  context->FinishDeviceComputation();
145  field->Reserve(size);
146  for (int i = 0; i < size; ++i) {
147  field->Add(static_cast<DstType>(buffer[i]));
148  }
149 }
150 
151 template <typename SrcType, typename DstType, class Context>
152 inline void CopyFromProtoAsIs(
153  const size_t size,
154  const google::protobuf::RepeatedField<SrcType>& field,
155  DstType* dst,
156  Context* context) {
157  static_assert(
158  sizeof(SrcType) == sizeof(DstType),
159  "The source type and dest type cannot be copied as-is. Did "
160  "you mean CopyFromProtoWithCast?");
161  CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
162  context->template Copy<DstType, CPUContext, Context>(
163  size, reinterpret_cast<const DstType*>(field.data()), dst);
164 }
165 
166 template <typename SrcType, typename DstType, class Context>
167 inline void CopyFromProtoWithCast(
168  const size_t size,
169  const google::protobuf::RepeatedField<SrcType>& field,
170  DstType* dst,
171  Context* context) {
172  CAFFE_ENFORCE_EQ(size, field.size(), "Incorrect proto field size.");
173  // TODO: there is one unnecessary copy here if the context is already
174  // CPUContext. Remove it if it becomes performance critical.
175  unique_ptr<DstType[]> buffer(new DstType[size]);
176  const SrcType* src = field.data();
177  for (int i = 0; i < size; ++i) {
178  buffer[i] = static_cast<DstType>(src[i]);
179  }
180  context->template Copy<DstType, CPUContext, Context>(size, buffer.get(), dst);
181 }
182 
183 } // namespace detail
184 
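These helpers are how tensor data moves between device memory and the repeated fields of TensorProto in the code below: the AsIs variants apply when source and destination have the same width (float, int64), the WithCast variants when a widening cast into int32_data is needed (bool, uint8, int16, ...). A CPU-only sketch of the calling convention, assuming CPUContext:

CPUContext cpu_context;
bool values[3] = {true, false, true};
google::protobuf::RepeatedField<int32_t> int32_field;
detail::CopyToProtoWithCast(
    3, values, &int32_field, &cpu_context);
// int32_field now holds {1, 0, 1}.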
185 template <class Context>
186 void TensorSerializer<Context>::Serialize(
187  const Blob& blob,
188  const string& name,
189  BlobSerializerBase::SerializationAcceptor acceptor) {
190  this->SerializeWithChunkSize(blob, name, acceptor, kDefaultChunkSize);
191 }
192 
193 template <class Context>
194 void TensorSerializer<Context>::SerializeWithChunkSize(
195  const Blob& blob,
196  const string& name,
197  BlobSerializerBase::SerializationAcceptor acceptor,
198  int chunk_size) {
199  CAFFE_ENFORCE(blob.IsType<Tensor<Context>>());
200  const auto& tensor = blob.template Get<Tensor<Context>>();
201  if (chunk_size == kNoChunking) {
202  chunk_size = tensor.size() + 1; // to account for empty tensors
203  } else if (chunk_size == kDefaultChunkSize) {
204  chunk_size = FLAGS_caffe2_tensor_chunk_size;
205  }
206 
207  auto processChunk = [&](int64_t chunkStart) {
208  BlobProto blob_proto;
209  blob_proto.set_name(name);
210  blob_proto.set_type(kTensorBlobType);
211  TensorProto& proto = *blob_proto.mutable_tensor();
212  proto.set_name(name);
213  this->Serialize(
214  tensor, name, blob_proto.mutable_tensor(), chunkStart, chunk_size);
215  acceptor(
216  MakeString(name, kChunkIdSeparator, chunkStart / chunk_size),
217  blob_proto.SerializeAsString());
218  };
219 
220 #ifndef __ANDROID__
221  std::vector<std::future<void>> futures;
222  // Poor man's I/O-bound thread pool.
223  SimpleQueue<size_t> chunkQueue;
224  auto task = [&]() {
225  size_t chunkStart;
226  while (chunkQueue.Pop(&chunkStart)) {
227  processChunk(chunkStart);
228  }
229  };
230  if (tensor.size() > chunk_size) {
231  for (int i = 0; i < FLAGS_caffe2_max_tensor_serializer_threads; ++i) {
232  futures.emplace_back(std::async(std::launch::async, task));
233  }
234  }
235 #endif
236 
237  VLOG(1) << "Serializing blob " << name;
238  // Serialize the whole vector. If the vector is empty, its shape still needs
239  // to be serialized in an empty proto.
240  for (size_t chunkBegin = 0;
241  chunkBegin < std::max(tensor.size(), static_cast<TIndex>(1));
242  chunkBegin += chunk_size) {
243  VLOG(2) << "Starting a chunk at " << chunkBegin;
244 #ifndef __ANDROID__
245  if (tensor.size() > chunk_size) {
246  chunkQueue.Push(chunkBegin);
247  } else {
248  // Sync mode for small tensors
249  processChunk(chunkBegin);
250  }
251 #else
252  // Since Android does not have std::future, we will always do sync mode
253  processChunk(chunkBegin);
254 #endif
255  }
256 
257 #ifndef __ANDROID__
258  chunkQueue.NoMoreJobs();
259  for (auto& fut : futures) {
260  fut.get();
261  }
262 #endif
263 }
264 
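Note the key handed to the acceptor above: MakeString(name, kChunkIdSeparator, chunkStart / chunk_size). For a blob named "weights" this yields keys such as "weights#%0", "weights#%1", and so on, one per chunk (the exact count depends on the tensor size and FLAGS_caffe2_tensor_chunk_size). A consumer that only needs the blob name can strip the suffix, as in this small illustrative sketch:

std::string key = "weights#%1";  // illustrative chunk key
std::string blob_name = key.substr(0, key.find(kChunkIdSeparator));  // "weights"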
265 template <class Context>
266 void TensorSerializer<Context>::Serialize(
267  const Tensor<Context>& input,
268  const string& /*name*/,
269  TensorProto* proto_ptr,
270  size_t chunkBegin,
271  int32_t chunkSize) {
272  CAFFE_ENFORCE(
273  chunkBegin <= input.size(),
274  "Chunk begin is out of tensor: ",
275  chunkBegin,
276  ' ',
277  input.size());
278  if (chunkBegin + chunkSize > input.size()) {
279  chunkSize = input.size() - chunkBegin;
280  }
281 
282  CAFFE_ENFORCE(
283  input.raw_data() || chunkSize == 0,
284  "The input does not have data yet. This is probably because you "
285  "created a tensor of non-zero shape but never filled its data via "
286  "mutable_data() calls. This means that it makes no sense to serialize "
287  "the tensor content.");
288 
289  TensorProto& proto = *proto_ptr;
290  proto.mutable_segment()->set_begin(chunkBegin);
291  proto.mutable_segment()->set_end(chunkBegin + chunkSize);
292 
293  for (int i = 0; i < input.ndim(); ++i) {
294  proto.add_dims(input.dim(i));
295  }
296  const TensorProto::DataType data_type = TypeMetaToDataType(input.meta());
297  proto.set_data_type(data_type);
298  StoreDeviceDetail(input, &proto);
299 
300  // A lot of copy-paste is error-prone. Should we create a macro for this?
301  switch (data_type) {
302  case TensorProto_DataType_FLOAT:
303  detail::CopyToProtoAsIs(
304  chunkSize,
305  input.template data<float>() + chunkBegin,
306  proto.mutable_float_data(),
307  &this->context_);
308  break;
309  case TensorProto_DataType_INT32:
310  detail::CopyToProtoAsIs(
311  chunkSize,
312  input.template data<int>() + chunkBegin,
313  proto.mutable_int32_data(),
314  &this->context_);
315  break;
316  case TensorProto_DataType_BYTE:
317  LOG(FATAL) << "This should not happen. When serializing, "
318  "BYTE is deprecated and moved to UINT8.";
319  break;
320  case TensorProto_DataType_STRING:
321  {
322  proto.mutable_string_data()->Reserve(chunkSize);
323  const string* content = input.template data<string>();
324  for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
325  proto.add_string_data(content[i]);
326  }
327  break;
328  }
329  case TensorProto_DataType_BOOL:
330  detail::CopyToProtoWithCast(
331  chunkSize,
332  input.template data<bool>() + chunkBegin,
333  proto.mutable_int32_data(),
334  &this->context_);
335  break;
336  case TensorProto_DataType_UINT8:
337  detail::CopyToProtoWithCast(
338  chunkSize,
339  input.template data<uint8_t>() + chunkBegin,
340  proto.mutable_int32_data(),
341  &this->context_);
342  break;
343  case TensorProto_DataType_INT8:
344  detail::CopyToProtoWithCast(
345  chunkSize,
346  input.template data<int8_t>() + chunkBegin,
347  proto.mutable_int32_data(),
348  &this->context_);
349  break;
350  case TensorProto_DataType_UINT16:
351  detail::CopyToProtoWithCast(
352  chunkSize,
353  input.template data<uint16_t>() + chunkBegin,
354  proto.mutable_int32_data(),
355  &this->context_);
356  break;
357  case TensorProto_DataType_INT16:
358  detail::CopyToProtoWithCast(
359  chunkSize,
360  input.template data<int16_t>() + chunkBegin,
361  proto.mutable_int32_data(),
362  &this->context_);
363  break;
364  case TensorProto_DataType_INT64:
365  detail::CopyToProtoAsIs(
366  chunkSize,
367  input.template data<int64_t>() + chunkBegin,
368  proto.mutable_int64_data(),
369  &this->context_);
370  break;
371  case TensorProto_DataType_FLOAT16: {
372  if (FLAGS_caffe2_serialize_fp16_as_bytes) {
373  const int kValue = 1;
374  CAFFE_ENFORCE_EQ(
375  reinterpret_cast<const char*>(&kValue)[0],
376  1,
377  "Serialization of FLOAT16 on big-endian platforms "
378  "is not implemented yet.");
379  unique_ptr<char[]> buffer(new char[2 * chunkSize]);
380  this->context_.template Copy<char, Context, CPUContext>(
381  2 * chunkSize,
382  reinterpret_cast<const char*>(
383  input.template data<float16>() + chunkBegin),
384  buffer.get());
385  this->context_.FinishDeviceComputation();
386  proto.set_byte_data(buffer.release(), 2 * chunkSize);
387  } else {
388  detail::CopyToProtoWithCast(
389  chunkSize,
390  reinterpret_cast<const uint16_t*>(input.template data<float16>()) +
391  chunkBegin,
392  proto.mutable_int32_data(),
393  &this->context_);
394  }
395  } break;
396  case TensorProto_DataType_DOUBLE:
397  detail::CopyToProtoAsIs(
398  chunkSize,
399  input.template data<double>() + chunkBegin,
400  proto.mutable_double_data(),
401  &this->context_);
402  break;
403  case TensorProto_DataType_UNDEFINED: {
404  proto.mutable_string_data()->Reserve(chunkSize);
405  Blob temp_blob;
406  const char* raw_data = static_cast<const char*>(input.raw_data());
407  for (int i = chunkBegin; i < chunkBegin + chunkSize; ++i) {
408  temp_blob.ShareExternal(
409  const_cast<char*>(raw_data + i * input.itemsize()), input.meta());
410  proto.add_string_data(temp_blob.Serialize(""));
411  }
412  } break;
413  // Note: we intentionally do not provide "default:" so if any new data types
414  // are added, the compiler should warn the user to add the case here.
415  }
416 }
417 
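As a concrete reference for the switch above, this sketch serializes one full chunk of a small CPU float tensor and checks the fields the proto ends up with; it only uses the public Serialize overload declared earlier and standard protobuf accessors:

Tensor<CPUContext> tensor;
tensor.Resize(2, 3);
tensor.mutable_data<float>();  // allocate so raw_data() is non-null

TensorProto proto;
TensorSerializer<CPUContext> serializer;
serializer.Serialize(
    tensor, "weights", &proto, /*chunkBegin=*/0,
    /*chunkSize=*/static_cast<int32_t>(tensor.size()));

CAFFE_ENFORCE_EQ(proto.dims_size(), 2);          // shape {2, 3}
CAFFE_ENFORCE_EQ(proto.data_type(), TensorProto_DataType_FLOAT);
CAFFE_ENFORCE_EQ(proto.float_data_size(), 6);    // one value per element
CAFFE_ENFORCE_EQ(proto.segment().begin(), 0);    // this chunk covers [0, 6)
CAFFE_ENFORCE_EQ(proto.segment().end(), 6);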
418 template <class Context>
419 void TensorDeserializer<Context>::Deserialize(
420  const BlobProto& blob_proto,
421  Blob* blob) {
422  Deserialize(blob_proto.tensor(), blob->GetMutable<Tensor<Context>>());
423 }
424 
425 template <class Context>
426 void TensorDeserializer<Context>::Deserialize(
427  const TensorProto& proto,
428  Tensor<Context>* tensor) {
429  // We create a local context for deserializing. Since Caffe2 contexts are
430  // usually lightweight, this should not involve too much overhead.
431  Context context(proto.device_detail());
432  context.SwitchToDevice(0);
433  vector<TIndex> dims;
434  for (const TIndex d : proto.dims()) {
435  dims.push_back(d);
436  }
437  tensor->Resize(dims);
438 
439  int64_t chunkBegin = 0;
440  auto chunkEnd = tensor->size();
441  if (proto.has_segment()) {
442  chunkBegin = proto.segment().begin();
443  chunkEnd = proto.segment().end();
444  }
445  CAFFE_ENFORCE(
446  0 <= chunkBegin && chunkBegin <= chunkEnd && chunkEnd <= tensor->size(),
447  "Invalid chunk ",
448  chunkBegin,
449  ' ',
450  chunkEnd,
451  " with total tensor size ",
452  tensor->size());
453  auto chunkSize = chunkEnd - chunkBegin;
454 
455  switch (proto.data_type()) {
456  case TensorProto_DataType_FLOAT:
457  detail::CopyFromProtoAsIs(
458  chunkSize,
459  proto.float_data(),
460  tensor->template mutable_data<float>() + chunkBegin,
461  &context);
462  break;
463  case TensorProto_DataType_INT32:
464  detail::CopyFromProtoAsIs(
465  chunkSize,
466  proto.int32_data(),
467  tensor->template mutable_data<int>() + chunkBegin,
468  &context);
469  break;
470  case TensorProto_DataType_BYTE:
471  // Since BYTE stores the data in a string field instead of a repeated
472  // field, we special-case it.
473  CAFFE_ENFORCE_EQ(
474  chunkSize, proto.byte_data().size(), "Incorrect proto field size.");
475  context.template Copy<uint8_t, Context, CPUContext>(
476  chunkSize,
477  reinterpret_cast<const uint8_t*>(proto.byte_data().data()),
478  tensor->template mutable_data<uint8_t>() + chunkBegin);
479  break;
480  case TensorProto_DataType_STRING:
481  // Special handling of string because it is a non-fundamental type.
482  {
483  string* content = tensor->template mutable_data<string>();
484  for (int i = 0; i < chunkSize; ++i) {
485  content[i + chunkBegin] = proto.string_data(i);
486  }
487  }
488  break;
489  case TensorProto_DataType_BOOL:
490  detail::CopyFromProtoWithCast(
491  chunkSize,
492  proto.int32_data(),
493  tensor->template mutable_data<bool>() + chunkBegin,
494  &context);
495  break;
496  case TensorProto_DataType_UINT8:
497  detail::CopyFromProtoWithCast(
498  chunkSize,
499  proto.int32_data(),
500  tensor->template mutable_data<uint8_t>() + chunkBegin,
501  &context);
502  break;
503  case TensorProto_DataType_INT8:
504  detail::CopyFromProtoWithCast(
505  chunkSize,
506  proto.int32_data(),
507  tensor->template mutable_data<int8_t>() + chunkBegin,
508  &context);
509  break;
510  case TensorProto_DataType_UINT16:
511  detail::CopyFromProtoWithCast(
512  chunkSize,
513  proto.int32_data(),
514  tensor->template mutable_data<uint16_t>() + chunkBegin,
515  &context);
516  break;
517  case TensorProto_DataType_INT16:
518  detail::CopyFromProtoWithCast(
519  chunkSize,
520  proto.int32_data(),
521  tensor->template mutable_data<int16_t>() + chunkBegin,
522  &context);
523  break;
524  case TensorProto_DataType_INT64:
525  detail::CopyFromProtoAsIs(
526  chunkSize,
527  proto.int64_data(),
528  tensor->template mutable_data<int64_t>() + chunkBegin,
529  &context);
530  break;
531  case TensorProto_DataType_FLOAT16:
532  if (proto.has_byte_data()) {
533  const int kValue = 1;
534  CAFFE_ENFORCE_EQ(
535  reinterpret_cast<const char*>(&kValue)[0],
536  1,
537  "Serialization of FLOAT16 on big-endian platforms "
538  "is not implemented yet.");
539  CAFFE_ENFORCE_EQ(
540  2 * chunkSize,
541  proto.byte_data().size(),
542  "Incorrect proto field size.");
543  context.template Copy<float16, Context, CPUContext>(
544  chunkSize,
545  reinterpret_cast<const float16*>(proto.byte_data().data()),
546  tensor->template mutable_data<float16>() + chunkBegin);
547  } else {
548  // Backward compatibility with models which used int32_data field
549  detail::CopyFromProtoWithCast(
550  chunkSize,
551  proto.int32_data(),
552  reinterpret_cast<uint16_t*>(
553  tensor->template mutable_data<float16>()) +
554  chunkBegin,
555  &context);
556  }
557  break;
558  case TensorProto_DataType_DOUBLE:
559  detail::CopyFromProtoAsIs(
560  chunkSize,
561  proto.double_data(),
562  tensor->template mutable_data<double>() + chunkBegin,
563  &context);
564  break;
565  case TensorProto_DataType_UNDEFINED: {
566  Blob temp_blob;
567  void* raw_ptr = nullptr;
568  for (int i = 0; i < chunkSize; ++i) {
569  temp_blob.Deserialize(proto.string_data(i));
570  if (i == 0) {
571  raw_ptr = tensor->raw_mutable_data(temp_blob.meta());
572  }
573  temp_blob.meta().copy()(
574  temp_blob.GetRaw(),
575  static_cast<char*>(raw_ptr) +
576  (i + chunkBegin) * temp_blob.meta().itemsize(),
577  1);
578  }
579  }
580  }
581  context.FinishDeviceComputation();
582 }
583 
584 } // namespace caffe2
585 
586 #endif // CAFFE2_CORE_BLOB_SERIALIZATION_H_
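Putting the pieces together, a round trip through these APIs might look like the following sketch (CPU only; a std::map stands in for a real DB, the blob name and contents are illustrative, and small tensors are serialized synchronously, so no locking is needed around the acceptor):

#include <map>
#include <string>

#include "caffe2/core/blob.h"
#include "caffe2/core/blob_serialization.h"

void RoundTripExample() {
  using namespace caffe2;

  // Build a small CPU tensor blob.
  Blob blob;
  auto* tensor = blob.GetMutable<Tensor<CPUContext>>();
  tensor->Resize(2, 3);
  float* data = tensor->mutable_data<float>();
  for (TIndex i = 0; i < tensor->size(); ++i) {
    data[i] = static_cast<float>(i);
  }

  // Serialize; each chunk lands in the map under "weights#%<chunk-id>".
  std::map<std::string, std::string> db;
  blob.Serialize(
      "weights",
      [&db](const std::string& key, const std::string& value) {
        db[key] = value;
      });

  // Deserialize every chunk back into a fresh blob.
  Blob restored;
  for (const auto& kv : db) {
    restored.Deserialize(kv.second);
  }
}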