A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules. More...
Namespaces | |
enforce_detail | |
Rich logging messages. | |
Typedefs | |
using | MemoryDeleter = void(*)(void *) |
typedef int64_t | TIndex |
template<typename Key , typename Value > | |
using | CaffeMap = std::map< Key, Value > |
typedef Tensor< CUDAContext > | TensorCUDA |
typedef void(* | EventCreateFunction) (const DeviceOption &option, Event *) |
typedef void(* | EventRecordFunction) (Event *, const void *, const char *) |
typedef void(* | EventWaitFunction) (const Event *, void *) |
typedef void(* | EventFinishFunction) (const Event *) |
typedef EventStatus(* | EventQueryFunction) (const Event *) |
typedef const std::string &(* | EventErrorMessageFunction) (const Event *) |
typedef void(* | EventSetFinishedFunction) (const Event *, const char *) |
typedef void(* | EventResetFunction) (Event *) |
typedef ObserverBase< NetBase > | NetObserver |
typedef std::function< std::unique_ptr< NetObserver >(NetBase *)> | NetObserverCreator |
typedef ObserverBase< OperatorBase > | OperatorObserver |
typedef Registry< std::string, std::unique_ptr< OperatorBase >, const OperatorDef &, Workspace * > *(* | RegistryFunction) () |
using | EnginePrefType = std::vector< std::string > |
using | PerOpEnginePrefType = CaffeMap< int, CaffeMap< std::string, EnginePrefType >> |
using | GlobalEnginePrefType = CaffeMap< int, EnginePrefType > |
typedef std::function< bool(int)> | ShouldContinue |
using | ExportedStatList = std::vector< ExportedStatValue > |
Holds names and values of counters exported from a StatRegistry. | |
using | ExportedStatMap = std::unordered_map< std::string, int64_t > |
typedef Tensor< CPUContext > | TensorCPU |
typedef TypeMeta(* | TypeCall) (const void *) |
typedef vector< TIndex >(* | TensorInfoCall) (const void *, bool *shares_data, size_t *capacity, DeviceOption *device) |
typedef intptr_t | CaffeTypeId |
typedef half_float::half | half |
typedef half | DataType |
template<typename T > | |
using | deleted_unique_ptr = std::unique_ptr< T, std::function< void(T *)>> |
using | ParallelFor = std::function< void(size_t, std::function< void(size_t)>)> |
using | NumericTypes = TensorTypes< int32_t, int64_t, float, double > |
using | IntTypes = TensorTypes< int32_t, int64_t > |
using | BoolTypes = TensorTypes< bool > |
using | IntBoolTypes = TensorTypes< int32_t, int64_t, bool > |
template<typename InputTypes , class Context , class Functor , class OutputType = SameTypeAsInput> | |
using | UnaryElementwiseOp = UnaryElementwiseWithArgsOp< InputTypes, Context, WithDefaultConstructor< Functor >, OutputType > |
UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the difference that it takes a functor with default constructor, e.g. More... | |
using | n = 2 |
using | MapType64To64 = MapTypeTraits< int64_t, int64_t >::MapType |
using | MapType64To32 = MapTypeTraits< int64_t, int32_t >::MapType |
using | MapType32To32 = MapTypeTraits< int32_t, int32_t >::MapType |
using | MapType32To64 = MapTypeTraits< int32_t, int64_t >::MapType |
template<typename ScalarFunctor , typename TypeMap = FixedType<std::string>> | |
using | StringElementwiseOp = UnaryElementwiseWithArgsOp< TensorTypes< std::string >, CPUContext, ForEach< ScalarFunctor >, TypeMap > |
using | RebatchingQueuePtr = std::unique_ptr< RebatchingQueue > |
template<typename T > | |
using | EArrXt = Eigen::Array< T, Eigen::Dynamic, 1 > |
using | EArrXf = Eigen::ArrayXf |
using | EArrXd = Eigen::ArrayXd |
using | EArrXi = Eigen::ArrayXi |
using | EArrXb = EArrXt< bool > |
template<typename T > | |
using | EArrXXt = Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic > |
using | EArrXXf = Eigen::ArrayXXf |
template<typename T > | |
using | ERArrXXt = Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > |
using | ERArrXXf = ERArrXXt< float > |
template<typename T > | |
using | EVecXt = Eigen::Matrix< T, Eigen::Dynamic, 1 > |
using | EVecXd = Eigen::VectorXd |
using | EVecXf = Eigen::VectorXf |
using | ERVecXd = Eigen::RowVectorXd |
using | ERVecXf = Eigen::RowVectorXf |
template<typename T > | |
using | EMatXt = Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > |
using | EMatXd = Eigen::MatrixXd |
using | EMatXf = Eigen::MatrixXf |
template<typename T > | |
using | ERMatXt = Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > |
using | ERMatXd = ERMatXt< double > |
using | ERMatXf = ERMatXt< float > |
template<typename T > | |
using | EigenMatrixMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > > |
template<typename T > | |
using | EigenArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic > > |
template<typename T > | |
using | EigenVectorMap = Eigen::Map< Eigen::Matrix< T, Eigen::Dynamic, 1 > > |
template<typename T > | |
using | EigenVectorArrayMap = Eigen::Map< Eigen::Array< T, Eigen::Dynamic, 1 > > |
template<typename T > | |
using | ConstEigenMatrixMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > > |
template<typename T > | |
using | ConstEigenArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic > > |
template<typename T > | |
using | ConstEigenVectorMap = Eigen::Map< const Eigen::Matrix< T, Eigen::Dynamic, 1 > > |
template<typename T > | |
using | ConstEigenVectorArrayMap = Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, 1 > > |
Functions | |
void | ConvertToRawDataset (const string &input_db_name, const string &output_db_name) |
void | ReadImage (std::ifstream *file, int *label, char *buffer) |
void | WriteToDB (const string &filename, const int num_items, const int &offset, db::DB *db) |
void | ConvertCIFAR () |
void | ConvertImageDataset (const string &input_folder, const string &list_filename, const string &output_db_name, const bool) |
uint32_t | swap_endian (uint32_t val) |
void | convert_dataset (const char *image_filename, const char *label_filename, const char *db_path, const int data_limit) |
void | run () |
void | NoDelete (void *) |
CPUAllocator * | GetCPUAllocator () |
void | SetCPUAllocator (CPUAllocator *alloc) |
void | swap (Blob &lhs, Blob &rhs) |
CAFFE_DEFINE_TYPED_REGISTRY (BlobSerializerRegistry, CaffeTypeId, BlobSerializerBase, std::unique_ptr) | |
CAFFE_DEFINE_REGISTRY (BlobDeserializerRegistry, BlobDeserializerBase) | |
CAFFE_DECLARE_TYPED_REGISTRY (BlobSerializerRegistry, CaffeTypeId, BlobSerializerBase, std::unique_ptr) | |
unique_ptr< BlobSerializerBase > | CreateSerializer (CaffeTypeId id) |
CAFFE_DECLARE_REGISTRY (BlobDeserializerRegistry, BlobDeserializerBase) | |
unique_ptr< BlobDeserializerBase > | CreateDeserializer (const string &type) |
bool | HasCudaRuntime () |
const std::map< string, string > & | GetBuildOptions () |
template<typename T , typename... Args> | |
std::enable_if<!std::is_array< T >::value, std::unique_ptr< T > >::type | make_unique (Args &&...args) |
template<typename T > | |
std::enable_if< std::is_array< T >::value, std::unique_ptr< T > >::type | make_unique (const size_t n) |
template<typename T , typename... Args> | |
std::enable_if< std::extent< T >::value!=0, std::unique_ptr< T > >::type | make_unique (Args &&...)=delete |
template<typename Dst , typename Src > | |
Dst | dynamic_cast_if_rtti (Src ptr) |
size_t | cudnnCompiledVersion () |
size_t | cudnnRuntimeVersion () |
void | CheckCuDNNVersions () |
cudnnTensorFormat_t | GetCudnnTensorFormat (const StorageOrder &order) |
A wrapper function to convert the Caffe storage order to cudnn storage order enum values. | |
int | NumCudaDevices () |
Returns the number of devices. | |
void | SetDefaultGPUID (const int deviceid) |
int | GetDefaultGPUID () |
int | CaffeCudaGetDevice () |
Gets the current GPU id. More... | |
void | CaffeCudaSetDevice (const int id) |
Sets the current GPU id. More... | |
int | GetGPUIDForPointer (const void *ptr) |
Gets the GPU id that the current pointer is located at. | |
const cudaDeviceProp & | GetDeviceProperty (const int device) |
Gets the device property for the given device. More... | |
void | DeviceQuery (const int deviceid) |
Runs a device query function and prints out the results to LOG(INFO). | |
bool | GetCudaPeerAccessPattern (vector< vector< bool > > *pattern) |
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values specifying whether peer access is possible. More... | |
bool | TensorCoreAvailable () |
Return the availability of TensorCores for math. | |
const char * | cublasGetErrorString (cublasStatus_t error) |
Return a human readable cublas error string. | |
const char * | curandGetErrorString (curandStatus_t error) |
Return a human readable curand error string. | |
int | CudaVersion () |
A runtime function to report the cuda version that Caffe2 is built with. | |
bool | HasCudaGPU () |
Check if the current running session has a cuda gpu present. More... | |
int | CAFFE_GET_BLOCKS (const int N) |
Compute the number of blocks needed to run N threads. | |
uint32_t | RandomNumberSeed () |
A function to generate a random number seed that is unique in a best-effort basis, using an ever-incrementing seed and the current time. | |
CudaMemoryPoolType | GetCudaMemoryPoolType () |
Gets the current memory pool type used by Caffe2. More... | |
CAFFE_KNOWN_TYPE (db::DBReader) | |
CAFFE_KNOWN_TYPE (db::Cursor) | |
void | EventCreateCPU (const DeviceOption &option, Event *event) |
void | EventRecordCPU (Event *event, const void *, const char *err_msg) |
void | EventFinishCPU (const Event *event) |
void | EventWaitCPUCPU (const Event *event, void *) |
EventStatus | EventQueryCPU (const Event *event) |
const std::string & | EventErrorMessageCPU (const Event *event) |
void | EventSetFinishedCPU (const Event *event, const char *err_msg) |
void | EventResetCPU (Event *event) |
REGISTER_EVENT_CREATE_FUNCTION (CPU, EventCreateCPU) | |
REGISTER_EVENT_RECORD_FUNCTION (CPU, EventRecordCPU) | |
REGISTER_EVENT_WAIT_FUNCTION (CPU, CPU, EventWaitCPUCPU) | |
REGISTER_EVENT_FINISH_FUNCTION (CPU, EventFinishCPU) | |
REGISTER_EVENT_QUERY_FUNCTION (CPU, EventQueryCPU) | |
REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (CPU, EventErrorMessageCPU) | |
REGISTER_EVENT_SET_FINISHED_FUNCTION (CPU, EventSetFinishedCPU) | |
REGISTER_EVENT_RESET_FUNCTION (CPU, EventResetCPU) | |
bool | EventCanScheduleCPU (const Event *, const Event *) |
void | EventCreateCUDA (const DeviceOption &option, Event *event) |
void | EventRecordCUDA (Event *event, const void *context, const char *err_msg) |
void | EventFinishCUDA (const Event *event) |
void | EventWaitCUDACUDA (const Event *event, void *context) |
void | EventWaitCPUCUDA (const Event *event, void *context) |
void | EventWaitCUDACPU (const Event *event, void *context) |
EventStatus | EventQueryCUDA (const Event *event) |
const std::string & | EventErrorMessageCUDA (const Event *event) |
void | EventSetFinishedCUDA (const Event *event, const char *err_msg) |
void | EventResetCUDA (Event *event) |
REGISTER_EVENT_CREATE_FUNCTION (CUDA, EventCreateCUDA) | |
REGISTER_EVENT_RECORD_FUNCTION (CUDA, EventRecordCUDA) | |
REGISTER_EVENT_WAIT_FUNCTION (CUDA, CUDA, EventWaitCUDACUDA) | |
REGISTER_EVENT_WAIT_FUNCTION (CPU, CUDA, EventWaitCPUCUDA) | |
REGISTER_EVENT_WAIT_FUNCTION (CUDA, CPU, EventWaitCUDACPU) | |
REGISTER_EVENT_FINISH_FUNCTION (CUDA, EventFinishCUDA) | |
REGISTER_EVENT_QUERY_FUNCTION (CUDA, EventQueryCUDA) | |
REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (CUDA, EventErrorMessageCUDA) | |
REGISTER_EVENT_SET_FINISHED_FUNCTION (CUDA, EventSetFinishedCUDA) | |
REGISTER_EVENT_RESET_FUNCTION (CUDA, EventResetCUDA) | |
REGISTER_EVENT_WAIT_FUNCTION (MKLDNN, CUDA, EventWaitCPUCUDA) | |
REGISTER_EVENT_WAIT_FUNCTION (CUDA, MKLDNN, EventWaitCUDACPU) | |
CAFFE_DEFINE_REGISTRY (Caffe2FlagsRegistry, Caffe2FlagParser, const string &) | |
void | SetUsageMessage (const string &str) |
Sets the usage message when a commandline tool is called with "--help". | |
const char * | UsageMessage () |
Returns the usage message for the commandline tool set by SetUsageMessage. | |
bool | ParseCaffeCommandLineFlags (int *pargc, char ***pargv) |
Parses the commandline flags. More... | |
bool | CommandLineFlagsHasBeenParsed () |
Checks whether the commandline flags have already been parsed. | |
CAFFE_DECLARE_REGISTRY (Caffe2FlagsRegistry, Caffe2FlagParser, const string &) | |
OperatorDef * | AddOp (NetDef *netdef_ptr, string op_type, std::vector< string > inputs, std::vector< string > outputs) |
bool | MatchStrings (string p, string s) |
This allows for the use of * and | to match operator types, engines, or any other property that is represented by strings. More... | |
bool | MatchArguments (const OperatorDef &p_op, const OperatorDef &g_op) |
This ensures that each named arg that exists in the pattern exists in g_op, is equal in value. | |
bool | GlobalInit (int *pargc, char ***argv) |
Initialize the global environment of caffe2. More... | |
bool | Caffe2CheckIntrinsicsFeatures (int *, char ***) |
REGISTER_CAFFE2_INIT_FUNCTION (Caffe2CheckIntrinsicsFeatures,&Caffe2CheckIntrinsicsFeatures,"Check intrinsics compatibility between the CPU feature and the binary.") | |
std::string | StripBasename (const std::string &full_path) |
size_t | ReplaceAll (string &s, const char *from, const char *to) |
void | SetStackTraceFetcher (std::function< string(void)> fetcher) |
void | SetOperatorLogger (std::function< void(const OperatorDef &)> tracer) |
std::function< void(const OperatorDef &)> | GetOperatorLogger () |
bool | InitCaffeLogging (int *argc, char **argv) |
void | ShowLogInfoToStderr () |
A utility to allow one to show log info to stderr after the program starts. More... | |
constexpr bool | IsUsingGoogleLogging () |
void | MakeStringInternal (std::stringstream &) |
template<typename T > | |
void | MakeStringInternal (std::stringstream &ss, const T &t) |
template<typename T , typename... Args> | |
void | MakeStringInternal (std::stringstream &ss, const T &t, const Args &...args) |
template<typename... Args> | |
string | MakeString (const Args &...args) |
template<> | |
string | MakeString (const string &str) |
string | MakeString (const char *c_str) |
template<class Container > | |
string | Join (const string &delimiter, const Container &v) |
template<class T > | |
void | LogMessageFatal (const char *file, int line, const T &message) |
template<typename T > | |
T & | CheckNotNullCommon (const char *file, int line, const char *names, T &t) |
template<typename T > | |
T * | CheckNotNull (const char *file, int line, const char *names, T *t) |
template<typename T > | |
T & | CheckNotNull (const char *file, int line, const char *names, T &t) |
template<class First , class Second > | |
std::ostream & | operator<< (std::ostream &out, const std::pair< First, Second > &p) |
template<class Iter > | |
void | PrintSequence (std::ostream &ss, Iter begin, Iter end) |
const CaffeMap< string, const ModuleSchema * > & | CurrentModules () |
Current Modules present in the Caffe2 runtime. More... | |
bool | HasModule (const string &name) |
Checks whether a module is already present in the current binary. | |
void | LoadModule (const string &name, const string &filename="") |
Load a module. More... | |
CAFFE_DEFINE_REGISTRY (NetRegistry, NetBase, const std::shared_ptr< const NetDef > &, Workspace *) | |
void | AddGlobalNetObserverCreator (NetObserverCreator creator) |
unique_ptr< NetBase > | CreateNet (const NetDef &net_def, Workspace *ws) |
Creates a network, accessing / creating blobs in the given workspace. More... | |
unique_ptr< NetBase > | CreateNet (const std::shared_ptr< const NetDef > &net_def, Workspace *ws) |
CAFFE_DECLARE_REGISTRY (NetRegistry, NetBase, const std::shared_ptr< const NetDef > &, Workspace *) | |
CAFFE_DEFINE_SHARED_REGISTRY (ThreadPoolRegistry, TaskThreadPool, const DeviceOption &) | |
CAFFE_REGISTER_CREATOR (ThreadPoolRegistry, CPU, AsyncNetCPUThreadPoolCreator) | |
std::shared_ptr< TaskThreadPool > | GetAsyncNetCPUThreadPool (int numa_node_id) |
CAFFE_DECLARE_SHARED_REGISTRY (ThreadPoolRegistry, TaskThreadPool, const DeviceOption &) | |
REGISTER_NET (async_dag, AsyncDAGNet) | |
std::shared_ptr< TaskThreadPool > | GetAsyncNetGPUThreadPool (int gpu_id) |
CAFFE_REGISTER_CREATOR (ThreadPoolRegistry, CUDA, AsyncNetGPUThreadPoolCreator) | |
REGISTER_NET (async_polling, AsyncPollingNet) | |
REGISTER_NET (async_scheduling, AsyncSchedulingNet) | |
REGISTER_NET (dag, DAGNet) | |
REGISTER_NET (simple, SimpleNet) | |
REGISTER_NET (async_simple, AsyncSimpleNet) | |
bool | IsNUMAEnabled () |
void | NUMABind (int numa_node_id) |
int | GetNUMANode (const void *ptr) |
int | GetNumNUMANodes () |
void | NUMAMove (void *ptr, size_t size, int numa_node_id) |
int | GetCurrentNUMANode () |
const std::string | OpRegistryKey (const std::string &op_type, const std::string &engine) |
void | SetPerOpEnginePref (const PerOpEnginePrefType &per_op_engine_pref) |
void | SetGlobalEnginePref (const GlobalEnginePrefType &global_engine_pref) |
void | SetEnginePref (const PerOpEnginePrefType &per_op_engine_pref, const GlobalEnginePrefType &global_engine_pref) |
void | SetOpEnginePref (const std::string &op_type, const CaffeMap< int, EnginePrefType > &op_pref) |
unique_ptr< OperatorBase > | CreateOperator (const OperatorDef &operator_def, Workspace *ws, int net_position) |
std::map< int32_t, OperatorRegistry * > * | gDeviceTypeRegistry () |
CAFFE_DEFINE_REGISTRY (CPUOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_REGISTER_DEVICE_TYPE (DeviceType::CPU, CPUOperatorRegistry) | |
CAFFE_DEFINE_REGISTRY (CUDAOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_REGISTER_DEVICE_TYPE (DeviceType::CUDA, CUDAOperatorRegistry) | |
CAFFE_DEFINE_REGISTRY (GradientRegistry, GradientMakerBase, const OperatorDef &, const vector< GradientWrapper > &) | |
GradientOpsMeta | GetGradientForOp (const OperatorDef &def, const vector< GradientWrapper > &g_output) |
Gets the GradientOpsMeta for the given operator def. | |
TensorShape | GetTensorShapeOfBlob (const Blob *b) |
TensorShapes | InferBlobShapesAndTypesFromWorkspace (Workspace *ws, const vector< std::unique_ptr< NetDef >> &nets) |
TensorShapes | InferBlobShapesAndTypesFromMap (const CaffeMap< std::string, std::vector< TIndex >> &blob_dimensions, const vector< std::unique_ptr< NetDef >> &nets) |
std::map< string, std::pair< DeviceOption, DeviceOption > > | ValidateTensorDevices (OperatorBase &op, const OperatorDef &op_def) |
std::set< std::string > | GetRegisteredOperators () |
CAFFE2_DEFINE_TENSOR_TYPES_DISPATCHER (TensorTypes, DoRunWithType, DoRunWithOtherType) CAFFE2_DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes2 | |
CAFFE_DECLARE_REGISTRY (CPUOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_DECLARE_REGISTRY (CUDAOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_DECLARE_REGISTRY (GradientRegistry, GradientMakerBase, const OperatorDef &, const vector< GradientWrapper > &) | |
std::ostream & | operator<< (std::ostream &out, const OpSchema &schema) |
template<typename T_I = int> | |
TensorShape | CreateTensorShape (vector< T_I > dims,::caffe2::TensorProto_DataType dt) |
vector< TIndex > | GetDimsVector (const TensorShape &shape) |
std::pair< std::vector< DeviceOption >, std::vector< DeviceOption > > | InferOpInputOutputDevice (const OperatorDef &op) |
template<uint64_t OpsPerPoint> | |
OpSchema::Cost | PointwiseCostInference (const OperatorDef &, const vector< TensorShape > &inputs) |
bool | RunPlanOnWorkspace (Workspace *ws, const PlanDef &plan, ShouldContinue shouldContinue) |
CAFFE_KNOWN_TYPE (QTensor< CPUContext >) | |
template<typename KeyType > | |
void | PrintOffendingKey (const KeyType &key) |
template<> | |
void | PrintOffendingKey (const string &key) |
template<typename F > | |
detail::ScopeGuardImplDecay< F > | MakeGuard (F &&f) noexcept(noexcept(detail::ScopeGuardImplDecay< F >(static_cast< F && >(f)))) |
ScopeGuard is a general implementation of the "Initialization is
Resource Acquisition" idiom. More... | |
ExportedStatMap | toMap (const ExportedStatList &stats) |
CAFFE_KNOWN_TYPE (Tensor< CPUContext >) | |
TypeCall | GetTypeCallFunction (CaffeTypeId id) |
void | RegisterTypeCallFunction (CaffeTypeId id, TypeCall c) |
TensorInfoCall | GetTensorInfoFunction (CaffeTypeId id) |
void | RegisterTensorInfoFunction (CaffeTypeId id, TensorInfoCall c) |
vector< TIndex > | ToVectorTIndex (const std::vector< int > &src) |
A utility function to convert vector<int> to vector<TIndex>. | |
TIndex | size_from_dim_ (int k, const vector< TIndex > &dims) |
Return product of all dimensions starting from K. | |
TIndex | size_to_dim_ (int k, const vector< TIndex > &dims) |
TIndex | size_between_dim_ (int k, int l, const vector< TIndex > &dims) |
int | canonical_axis_index_ (int axis_index, int ndims) |
template<class Context > | |
TypeMeta | GetTensorType (const void *c) |
template<class Context > | |
vector< TIndex > | GetTensorInfo (const void *c, bool *shares_data, size_t *capacity, DeviceOption *device) |
CAFFE_DEFINE_REGISTRY (TransformRegistry, Transform) | |
unique_ptr< Transform > | CreateTransform (string key) |
NetDef | ApplyTransform (const string &key, const NetDef &netdef) |
double | average_net_run_duration (const NetDef &netdef, const NetDef &init_netdef, const int warmup_runs, const int main_runs) |
NetDef | ApplyTransformIfFaster (const string &key, const NetDef &netdef, const NetDef &init_netdef, const int warmup_runs, const int main_runs, const double improvement_threshold) |
CAFFE_DECLARE_REGISTRY (TransformRegistry, Transform) | |
std::map< CaffeTypeId, string > & | gTypeNames () |
std::set< string > & | gRegisteredTypeNames () |
std::mutex & | gCaffe2TypeRegistrationMutex () |
string | Demangle (const char *name) |
string | GetExceptionString (const std::exception &e) |
CAFFE_KNOWN_TYPE (float) | |
CAFFE_KNOWN_TYPE (int) | |
CAFFE_KNOWN_TYPE (std::string) | |
CAFFE_KNOWN_TYPE (bool) | |
CAFFE_KNOWN_TYPE (uint8_t) | |
CAFFE_KNOWN_TYPE (int8_t) | |
CAFFE_KNOWN_TYPE (uint16_t) | |
CAFFE_KNOWN_TYPE (int16_t) | |
CAFFE_KNOWN_TYPE (int64_t) | |
CAFFE_KNOWN_TYPE (float16) | |
CAFFE_KNOWN_TYPE (double) | |
CAFFE_KNOWN_TYPE (char) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< std::mutex >) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< std::atomic< bool >>) | |
CAFFE_KNOWN_TYPE (std::vector< int64_t >) | |
CAFFE_KNOWN_TYPE (std::vector< unsigned long >) | |
CAFFE_KNOWN_TYPE (bool *) | |
CAFFE_KNOWN_TYPE (char *) | |
CAFFE_KNOWN_TYPE (int *) | |
TensorProto::DataType | TypeMetaToDataType (const TypeMeta &meta) |
const TypeMeta & | DataTypeToTypeMeta (const TensorProto::DataType &dt) |
StorageOrder | StringToStorageOrder (const string &str) |
constexpr char | NameScopeSeparator () |
struct | CAFFE2_ALIGNED (2) __f16 |
template<typename T > | |
bool | fp16_type () |
template<> | |
bool | fp16_type< float16 > () |
std::string | GetUniqueName () |
REGISTER_CPU_OPERATOR (CreateDB, CreateDBOp< CPUContext >) | |
OPERATOR_SCHEMA (CreateDB).NumInputs(0).NumOutputs(1) | |
NO_GRADIENT (CreateDB) | |
REGISTER_CUDA_OPERATOR (CreateDB, CreateDBOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (FileStoreHandlerCreate, FileStoreHandlerCreateOp< CPUContext >) | |
NumInputs (0).NumOutputs(1).SetDoc(R"DOC( Creates a unique_ptr<StoreHandler> that uses the filesystem as backing store (typically a filesystem shared between many nodes | |
such as NFS This store handler is not built to be fast Its recommended use is for integration tests and prototypes where extra dependencies are cumbersome Use an ephemeral path to ensure multiple processes or runs don t interfere DOC | Arg ("path","base path used by the FileStoreHandler").Arg("prefix" |
such as NFS This store handler is not built to be fast Its recommended use is for integration tests and prototypes where extra dependencies are cumbersome Use an ephemeral path to ensure multiple processes or runs don t interfere DOC prefix for all keys used by this store | Output (0,"handler","unique_ptr<StoreHandler>") |
NO_GRADIENT (FileStoreHandlerCreateOp) | |
REGISTER_CUDA_OPERATOR (FileStoreHandlerCreate, FileStoreHandlerCreateOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (RedisStoreHandlerCreate, RedisStoreHandlerCreateOp< CPUContext >) | |
host name of Redis server | Arg ("port","port number of Redis server").Arg("prefix" |
NO_GRADIENT (RedisStoreHandlerCreateOp) | |
REGISTER_CUDA_OPERATOR (RedisStoreHandlerCreate, RedisStoreHandlerCreateOp< CUDAContext >) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< StoreHandler >) | |
REGISTER_CPU_OPERATOR (StoreSet, StoreSetOp) | |
NumInputs (2).NumOutputs(0).SetDoc(R"DOC( Set a blob in a store. The key is the input blob's name and the value is the data in that blob. The key can be overridden by specifying the 'blob_name' argument. )DOC").Arg("blob_name" | |
alternative key for the | blob (optional)") .Input(0 |
alternative key for the unique_ptr< StoreHandler > | Input (1,"data","data blob") |
REGISTER_CPU_OPERATOR (StoreGet, StoreGetOp) | |
NumInputs (1).NumOutputs(1).SetDoc(R"DOC( Get a blob from a store. The key is the output blob's name. The key can be overridden by specifying the 'blob_name' argument. )DOC").Arg("blob_name" | |
alternative key for the unique_ptr< StoreHandler > | Output (0,"data","data blob") |
REGISTER_CPU_OPERATOR (StoreAdd, StoreAddOp) | |
the store initializes it to and then performs the add operation The operation returns the resulting counter value DOC | Arg ("blob_name","key of the counter (required)").Arg("add_value" |
the store initializes it to and then performs the add operation The operation returns the resulting counter value DOC value that is | added (optional, default:1)") .Input(0 |
the store initializes it to and then performs the add operation The operation returns the resulting counter value DOC value that is unique_ptr< StoreHandler > | Output (0,"value","the current value of the counter") |
REGISTER_CPU_OPERATOR (StoreWait, StoreWaitOp) | |
NumInputs (1, 2).NumOutputs(0).SetDoc(R"DOC( Wait for the specified blob names to be set. The blob names can be passed either as an input blob with blob names or as an argument. )DOC").Arg("blob_names" | |
names of the blobs to wait | for (optional)") .Input(0 |
names of the blobs to wait unique_ptr< StoreHandler > | Input (1,"names","names of the blobs to wait for (optional)") |
REGISTER_CPU_OPERATOR (FC_Decomp, FullyConnectedOpDecomp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (FCGradient_Decomp, FullyConnectedDecompGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (FC_Decomp).NumInputs(4).NumOutputs(1) | |
OPERATOR_SCHEMA (FCGradient_Decomp).NumInputs(4).NumOutputs(3 | |
REGISTER_GRADIENT (FC_Decomp, GetFCDecompGradient) | |
REGISTER_CUDA_OPERATOR (FC_Decomp, FullyConnectedOpDecomp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (FCGradient_Decomp, FullyConnectedDecompGradientOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (TTContraction, TTContractionOp< float, CPUContext >) | |
REGISTER_CUDA_OPERATOR (TTContraction, TTContractionOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (TTContractionGradient, TTContractionGradientOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (ImageInput, ImageInputOp< CPUContext >) | |
NumInputs (0, 1).NumOutputs(2 | |
INT_MAX | TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &){vector< TensorShape > out(2);ArgumentHelper helper(def);int batch_size=helper.GetSingleArgument< int >("batch_size", 0);int crop=helper.GetSingleArgument< int >("crop",-1);int color=helper.GetSingleArgument< int >("color", 1);CHECK_GT(crop, 0);out[0]=CreateTensorShape(vector< int >{batch_size, crop, crop, color?3:1}, TensorProto::FLOAT);out[1]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);return out;}).SetDoc(R"DOC( Imports and processes images from a database. For each run of the operator |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial | image (optional)-The image is rescaled either up or down(with the scale argument) or just up(with the minsize argument)-The image is randomly cropped(crop size is passed as an argument but the location of the crop is random except if is_test is passed in which case the image in cropped at the center)-The image is normalized.Each of its color channels can have separate normalization values The dimension of the output image will always be cropxcrop) DOC") .Arg("batch_size" |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the | operator" ".Must be 1 or greater") .Arg ("color","Number of color channels (1 or 3). Defaults to 1").Arg("color_jitter" |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to | Arg ("img_saturation","Image saturation scale used in color jittering. ""Defaults to 0.4").Arg("img_brightness" |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to | Arg ("img_contrast","Image contrast scale used in color jittering. ""Defaults to 0.4").Arg("color_lighting" |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to | Arg ("color_lighting_std","Std of normal distribution where color lighting"" scaling factor is sampled. Defaults to 0.1").Arg("scale_jitter_type" |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to Scale the size of the smallest dimension of the image to this Scale and minsize are mutually exclusive Must be larger than crop | Arg ("minsize","Scale the size of the smallest dimension of the image to"" this only if the size is initially smaller. Scale and minsize are"" mutually exclusive. Must be larger than crop.").Arg("warp" |
the other dimension is proportionally scaled Defaults to | Arg ("crop","Size to crop the image to. Must be provided").Arg("mirror" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to | Arg ("mean","Mean by which to normalize color channels."" Defaults to 0.").Arg("mean_per_channel" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color | channel (1 or 3 elements).Defaults to mean argument.Channel order BGR") .Arg("std" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to | Arg ("std_per_channel","Vector of standard dev. per color channel "" (1 or 3 elements). Defaults to std argument. Channel order is BGR").Arg("bounding_ymin" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults | to (none)") .Arg("bounding_xmin" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to | Arg ("use_gpu_transform","1 if GPU acceleration should be used."" Defaults to 0. Can only be 1 in a CUDAContext").Arg("decode_threads" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to | Arg ("output_type","If gpu_transform, can set to FLOAT or FLOAT16.").Arg("db" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the | database (if not passed as input)") .Arg("db_type" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and | label (should have a number of elements equal to the number of additional" "outputs)") .Arg("random_scale" |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and shortest side desired for image resize Defaults to[-1,-1] or no random resize desired | Input (0,"reader","The input reader (a db::DBReader)").Output(0 |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and shortest side desired for image resize Defaults to[-1,-1] or no random resize desired Tensor containing the images | Output (1,"label","Tensor containing the labels").Output(2 |
NO_GRADIENT (ImageInput) | |
template<class Context > | |
bool | RandomSizedCropping (cv::Mat *img, const int crop, std::mt19937 *randgen) |
template<class Context > | |
void | Saturation (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen) |
template<class Context > | |
void | Brightness (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen) |
template<class Context > | |
void | Contrast (float *img, const int img_size, const float alpha_rand, std::mt19937 *randgen) |
template<class Context > | |
void | ColorJitter (float *img, const int img_size, const float saturation, const float brightness, const float contrast, std::mt19937 *randgen) |
template<class Context > | |
void | ColorLighting (float *img, const int img_size, const float alpha_std, const std::vector< std::vector< float >> &eigvecs, const std::vector< float > &eigvals, std::mt19937 *randgen) |
template<class Context > | |
void | ColorNormalization (float *img, const int img_size, const int channels, const std::vector< float > &mean, const std::vector< float > &std) |
template<class Context > | |
void | TransformImage (const cv::Mat &scaled_img, const int channels, float *image_data, const bool color_jitter, const float saturation, const float brightness, const float contrast, const bool color_lighting, const float color_lighting_std, const std::vector< std::vector< float >> &color_lighting_eigvecs, const std::vector< float > &color_lighting_eigvals, const int crop, const bool mirror, const std::vector< float > &mean, const std::vector< float > &std, std::mt19937 *randgen, std::bernoulli_distribution *mirror_this_image, bool is_test=false) |
template<class Context > | |
void | CropTransposeImage (const cv::Mat &scaled_img, const int channels, uint8_t *cropped_data, const int crop, const bool mirror, std::mt19937 *randgen, std::bernoulli_distribution *mirror_this_image, bool is_test=false) |
REGISTER_CUDA_OPERATOR (ImageInput, ImageInputOp< CUDAContext >) | |
template<typename T_IN , typename T_OUT , class Context > | |
bool | TransformOnGPU (Tensor< Context > &X, Tensor< Context > *Y, Tensor< Context > &mean, Tensor< Context > &std, Context *context) |
REGISTER_EVENT_CREATE_FUNCTION (MKLDNN, EventCreateCPU) | |
REGISTER_EVENT_RECORD_FUNCTION (MKLDNN, EventRecordCPU) | |
REGISTER_EVENT_WAIT_FUNCTION (MKLDNN, MKLDNN, EventWaitCPUCPU) | |
REGISTER_EVENT_WAIT_FUNCTION (MKLDNN, CPU, EventWaitCPUCPU) | |
REGISTER_EVENT_WAIT_FUNCTION (CPU, MKLDNN, EventWaitCPUCPU) | |
REGISTER_EVENT_FINISH_FUNCTION (MKLDNN, EventFinishCPU) | |
REGISTER_EVENT_QUERY_FUNCTION (MKLDNN, EventQueryCPU) | |
REGISTER_EVENT_ERROR_MESSAGE_FUNCTION (MKLDNN, EventErrorMessageCPU) | |
REGISTER_EVENT_SET_FINISHED_FUNCTION (MKLDNN, EventSetFinishedCPU) | |
REGISTER_EVENT_RESET_FUNCTION (MKLDNN, EventResetCPU) | |
CAFFE_DECLARE_REGISTRY (MKLOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_KNOWN_TYPE (GLTensor< GLfloat >) | |
CAFFE_KNOWN_TYPE (GLTensor< GLhalf >) | |
CAFFE_KNOWN_TYPE (GLTensor< half >) | |
CAFFE_KNOWN_TYPE (Tensor< GLContext >) | |
void | EventCreateOPENGL (const DeviceOption &, Event *) |
void | EventRecordOPENGL (Event *, const void *, const char *) |
void | EventWaitOPENGLOPENGL (const Event *, void *) |
void | EventFinishOPENGL (const Event *) |
void | EventResetOPENGL (Event *) |
REGISTER_EVENT_CREATE_FUNCTION (OPENGL, EventCreateOPENGL) | |
REGISTER_EVENT_RECORD_FUNCTION (OPENGL, EventRecordOPENGL) | |
REGISTER_EVENT_WAIT_FUNCTION (OPENGL, OPENGL, EventWaitOPENGLOPENGL) | |
REGISTER_EVENT_FINISH_FUNCTION (OPENGL, EventFinishOPENGL) | |
REGISTER_EVENT_RESET_FUNCTION (OPENGL, EventResetOPENGL) | |
template<typename T = half> | |
void | getTensorCPU (const GLTensor< T > &g_, TensorCPU &g) |
REGISTER_NET (opengl, GLNet) | |
CAFFE_DEFINE_REGISTRY (GLOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
CAFFE_REGISTER_DEVICE_TYPE (DeviceType::OPENGL, GLOperatorRegistry) | |
CAFFE_DECLARE_REGISTRY (GLOperatorRegistry, OperatorBase, const OperatorDef &, Workspace *) | |
void | dumpDefForOpenGL (const NetDef &d) |
NetDef | rewritePredictNetForOpenGL (const NetDef &predictNet, bool runFusion, std::unordered_set< std::string > cpuOps) |
bool | tryConvertToOpenGL (const NetDef &predictNet, NetDef *glPredictNet, bool runFusion, std::unordered_set< std::string > cpuOps) |
REGISTER_GL_OPERATOR (Relu, GLReluOp< half >) | |
REGISTER_GL_OPERATOR (Sigmoid, GLSigmoidOp< DataType >) | |
REGISTER_GL_OPERATOR (Concat, GLConcatOp< DataType >) | |
REGISTER_GL_OPERATOR (Conv, GLConvOp< DataType >) | |
REGISTER_GL_OPERATOR (CopyFromGL, CopyFromGLOp< DataType >) | |
REGISTER_GL_OPERATOR (Sum, GLSumOp< DataType >) | |
REGISTER_GL_OPERATOR (Add, GLSumOp< DataType >) | |
REGISTER_GL_OPERATOR (FC, GLFullyConnectedOp< DataType >) | |
REGISTER_GL_OPERATOR (NormalizePlanarYUV, GLNormalizePlanarYUVOp< DataType >) | |
REGISTER_GL_OPERATOR (AveragePool, GLAveragePoolOp< DataType >) | |
REGISTER_GL_OPERATOR (MaxPool, GLMaxPoolOp< DataType >) | |
REGISTER_GL_OPERATOR (Reshape, GLReshapeOp< DataType >) | |
REGISTER_GL_OPERATOR (ResizeNearest, GLResizeNearestOp< DataType >) | |
REGISTER_GL_OPERATOR (Softmax, GLSoftmaxOp< DataType >) | |
REGISTER_GL_OPERATOR (SpatialBN, GLSpatialBNOp< DataType >) | |
void | benchmarkModel (std::string init_net_pb, std::string predict_net_pb, std::string input_name, std::vector< int > input_dims, std::string net_name="benchmark_net", std::unordered_set< std::string > cpu_ops=std::unordered_set< std::string >({})) |
template<typename T = float> | |
void | PopulateCPUBlob (Workspace *ws, bool random, std::string name, std::vector< int > dims, int val=1, int dist_shift=0, float variance=1) |
template<typename T = half> | |
void | compareNetResult (Workspace &ws, NetDef &cpu_net, NetDef &gpu_net, string cpu_blob="ref_Y", string gpu_blob="gpu_Y", double tol=0.01, bool relative=false) |
template<typename T = half> | |
void | compareNetResult4D (Workspace &ws, NetDef &cpu_net, NetDef &gpu_net, string cpu_blob="ref_Y", string gpu_blob="gpu_Y", double tol=0.05) |
bool | tryConvertToMPSCNN (const NetDef &initNet, const NetDef &predictNet, NetDef *mpscnnPredictNet) |
NetDef | annotateDefWithReadCounts (const NetDef &net) |
NetDef | rewriteForMetal (const NetDef &net) |
NetDef | runMPSCNNFusion (const NetDef &net) |
void | dumpDef (const NetDef &d) |
void | mpscnnRecordExecutionFinish () |
MPSCNNContext & | getMPSCNNContext () |
bool | tryConvertToMPSCNNIntermediateCopies (const NetDef &initNet, const NetDef &predictNet, NetDef *mpscnnPredictNet) |
NetDef | setSpecialArgs (const NetDef &def) |
void | testMPSCNN () |
void | compareModels (const NetDef &initNet, NetDef predictNet) |
void | verifyRewrite (const NetDef &initNet, const NetDef &net, std::vector< int > inputDims) |
CAFFE_KNOWN_TYPE (GLImage< float >) | |
CAFFE_KNOWN_TYPE (GLImage< uint8_t >) | |
CAFFE_KNOWN_TYPE (GLImageVector< float >) | |
CAFFE_KNOWN_TYPE (GLImageVector< uint8_t >) | |
template<class T > | |
void | shareInputGLImage (Workspace *ws, const std::string &name, GLImageVector< T > *input) |
template<class T > | |
const GLImageVector< T > * | extractOutputGLImage (Workspace *ws, const std::string &name) |
const NetDef | create_gl_run_net (const NetDef &init_net, const NetDef &run_net, bool use_texture_input) |
NetDef | rewritePredictNetForOpenGL (const NetDef &predictNet, bool useTextureInput, bool useTiling, bool runFusion) |
bool | tryConvertToOpenGL (const NetDef &initNet, const NetDef &predictNet, NetDef *glPredictNet, bool useTextureInput, bool useTiling, bool runFusion) |
REGISTER_CPU_OPERATOR (OpenGLAdd, OpenGLAddOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLAdd).NumInputs(2).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConcat, OpenGLConcatOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLConcat).NumInputs(2 | |
NumOutputs (1, 2) | |
REGISTER_CPU_OPERATOR (OpenGLConv, OpenGLConvOp< float16_t, false, false >) | |
OPERATOR_SCHEMA (OpenGLConv).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConvPRelu, OpenGLConvOp< float16_t, true, false >) | |
OPERATOR_SCHEMA (OpenGLConvPRelu).NumInputs(4).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConvRelu, OpenGLConvOp< float16_t, false, true >) | |
OPERATOR_SCHEMA (OpenGLConvRelu).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConvTranspose, OpenGLConvTransposeOp< float16_t, false, false >) | |
OPERATOR_SCHEMA (OpenGLConvTranspose).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConvTransposePRelu, OpenGLConvTransposeOp< float16_t, true, false >) | |
OPERATOR_SCHEMA (OpenGLConvTransposePRelu).NumInputs(4).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLConvTransposeRelu, OpenGLConvTransposeOp< float16_t, false, true >) | |
OPERATOR_SCHEMA (OpenGLConvTransposeRelu).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (CopyToOpenGL, CopyToOpenGLOp< float16_t >) | |
OPERATOR_SCHEMA (CopyToOpenGL).NumInputs(1).NumOutputs(1).AllowInplace( | |
REGISTER_CPU_OPERATOR (CopyFromOpenGL, CopyFromOpenGLOp< float16_t >) | |
OPERATOR_SCHEMA (CopyFromOpenGL).NumInputs(1).NumOutputs(1).AllowInplace( | |
REGISTER_CPU_OPERATOR (OpenGLInstanceNorm, OpenGLInstanceNormPReluOp< float16_t, false >) | |
OPERATOR_SCHEMA (OpenGLInstanceNorm).NumInputs(3 | |
NumOutputs (1, 3).AllowInplace( | |
REGISTER_CPU_OPERATOR (OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp< float16_t, true >) | |
OPERATOR_SCHEMA (OpenGLInstanceNormPRelu).NumInputs(3 | |
REGISTER_CPU_OPERATOR (OpenGLMul, OpenGLMulOp< float16_t >) | |
REGISTER_CPU_OPERATOR (OpenGLNormalizePlanarYUV, GLNormPlanarYUVOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLNormalizePlanarYUV).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLPadImage, OpenGLPadImageOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLPadImage).NumInputs(1).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLPRelu, OpenGLPReluOp< float16_t, GLPRelu::PRelu >) | |
IdenticalTypeAndShape () | |
REGISTER_CPU_OPERATOR (OpenGLRelu, OpenGLPReluOp< float16_t, GLPRelu::Relu >) | |
REGISTER_CPU_OPERATOR (OpenGLResizeNearest, OpenGLResizeNearestOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLResizeNearest).NumInputs(1).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLSigmoid, OpenGLSigmoidOp< float16_t, Sigmoid >) | |
REGISTER_CPU_OPERATOR (OpenGLTanh, OpenGLSigmoidOp< float16_t, Tanh >) | |
REGISTER_CPU_OPERATOR (OpenGLTensorToTextureStylizerPreprocess, OpenGLTensorToTextureStylizerPreprocessOp) | |
OPERATOR_SCHEMA (OpenGLTensorToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLTextureToTextureStylizerPreprocess, OpenGLTextureToTextureStylizerPreprocessOp< RGBA >) | |
OPERATOR_SCHEMA (OpenGLTextureToTextureStylizerPreprocess).NumInputs(2).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLTextureToTensorStylizerDeprocess, OpenGLTextureToTensorStylizerDeprocessOp) | |
OPERATOR_SCHEMA (OpenGLTextureToTensorStylizerDeprocess).NumInputs(2).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLTextureToTextureStylizerDeprocess, OpenGLTextureToTextureStylizerDeprocessOp< RGBA >) | |
OPERATOR_SCHEMA (OpenGLTextureToTextureStylizerDeprocess).NumInputs(2).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (OpenGLSub, OpenGLSubOp< float16_t >) | |
OPERATOR_SCHEMA (OpenGLSub).NumInputs(2).NumOutputs(1) | |
void | testOpenGL () |
void | compareModelsForOpenGL (std::string name, const NetDef &initNet, NetDef predictNet, int width, int height, int channel, std::string input_type, std::string input_order) |
void | compareBatchedToTiledModels (std::string name, const NetDef &initNet, NetDef predictNet, int width, int height, int channel, std::string input_type, std::string input_order) |
int | runModelBenchmarks (caffe2::NetDef &init_net, caffe2::NetDef &predict_net, int warm_up_runs, int main_runs, int channel, int height, int width, std::string input_type, std::string input_order, std::string engine, bool run_individual=false, bool use_texture_input=false, bool use_tiling=false, bool run_fusion=true) |
std::string & | gSNPELocation () |
REGISTER_CPU_OPERATOR (SNPE, SNPEOp) | |
void | uniformQuantize2b1b (const TensorCPU &X, const std::vector< std::unique_ptr< TensorCPU >> &XQ, float offset, float inter_center_distance) |
void | qconv (const ConvArgs &args, const TensorCPU &X, const TensorCPU &W, const TensorCPU *b, TensorCPU *Y) |
void | qpad_zero (const ConvArgs &args, const TensorCPU &X, TensorCPU *Y) |
void | signQuantize (const TensorCPU &X, TensorCPU *XQ) |
void | filterNormalization11 (const TensorCPU &WQ, TensorCPU *WQN) |
void | filterNormalizationL1 (const TensorCPU &W, TensorCPU *WL1) |
void | qim2col (const ConvArgs &args, const TensorCPU &XQ, const TensorCPU &WQ, TensorCPU *XQcol) |
std::unique_ptr< QConvState > | create2b1bConvState (Workspace *ws, const TensorCPU &W, const TensorCPU *b) |
void | run2b1bConvGeneric (QConvState *state, const ConvArgs &args, const TensorCPU &X, TensorCPU *Y) |
void | run2b1bUnification (QConvState *state, size_t N, size_t C, const float *WQNVdata, const float *YQs0Vdata, const float *YQs1Vdata, size_t YQstride, float *Ydata, size_t Ystride, const float *bias) |
REGISTER_CPU_OPERATOR (QConv, QConvOp) | |
size_t | divRoundUp (size_t x, size_t d) |
bool | run2b1bConvNeon (QConvState *state, const ConvArgs &args, const TensorCPU &X, TensorCPU *Y) |
CAFFE_KNOWN_TYPE (MPICommonWorldWrapper) | |
std::mutex & | MPIMutex () |
MPI_Comm | GlobalMPIComm () |
Gets the global MPI communicator used by Caffe2. More... | |
void | SetGlobalMPIComm (MPI_Comm new_comm) |
Sets the global MPI communicator. More... | |
int | MPICommSize (MPI_Comm comm) |
A helper function to return the size of the given communicator. | |
int | MPICommRank (MPI_Comm comm) |
A helper function to return the rank of the given communicator. | |
void | MPISetupPeers (const int replicas, const string &role, const string &job_path) |
A function used to perform peer setup so one does not need to use mpirun / mpiexec to run the binary. More... | |
void | CheckInitializedMPI () |
REGISTER_CPU_OPERATOR (Abs, UnaryElementwiseOp< TensorTypes< float >, CPUContext, AbsCPUFunctor >) | |
REGISTER_CPU_OPERATOR (AbsGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< AbsGradientCPUFunctor >>) | |
element wise DOC | Input (0,"input","Input tensor").Output(0 |
element wise DOC The absolute value of the input tensor computed element wise | InheritOnnxSchema ("Abs") |
OPERATOR_SCHEMA (AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape() | |
REGISTER_GRADIENT (Abs, GetAbsGradient) | |
REGISTER_CPU_OPERATOR (Accumulate, AccumulateOp< float, CPUContext >) | |
we first initialize the output tensor to all and then do accumulation Any further calls to the | operator, given that no one else fiddles with the output in the interim, will do simple accumulations.Accumulation is done using Axpby operation as shown:Y=1 *X+gamma *Y where X is the input tensor, Y is the output tensor and gamma is the multiplier argument.) DOC") .Arg ("gamma","(float, default 1.0) Accumulation multiplier").Input(0 |
we first initialize the output tensor to all and then do accumulation Any further calls to the The input tensor that has to be accumulated to the output tensor If the output size is not the same as input the output tensor is first reshaped and initialized to and only accumulation is done | Output (0,"output","Accumulated output tensor") |
SHOULD_NOT_DO_GRADIENT (Accumulate) | |
REGISTER_CPU_OPERATOR (Accuracy, AccuracyOp< float, CPUContext >) | |
NumInputs(2).NumOutputs(1).ScalarType(TensorProto | SHOULD_NOT_DO_GRADIENT (Accuracy) |
REGISTER_CPU_OPERATOR (RowWiseArgMax, RowWiseArgMaxOp< CPUContext >) | |
this | operator returns a 2D (N X 1) output tensor with the index of the maximum value in each row. If there are duplicate max values in a row the index of the first occurrence is returned.) DOC") .Input(0 |
this N X D input tensor | Output (0,"Z","2D (N X 1) output tensor") |
NO_GRADIENT (RowWiseArgMax) | |
REGISTER_CPU_OPERATOR (ArgMax, ArgMaxOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ArgMin, ArgMinOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (Assert, AssertOp< CPUContext >) | |
or long longs and checks if all values are true when coerced into a boolean In other for non bool types this asserts that all values in the tensor are non zero DOC | Arg ("error_msg","An error message to print when the assert fails.", false) |
REGISTER_CPU_OPERATOR (BatchGather, BatchGatherOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (BatchGatherGradient, BatchGatherGradientOp< CPUContext >) | |
output_dims | push_back (data_dims[0]) |
output_dims | insert (output_dims.end(), indices_dims.begin(), indices_dims.end()) |
output_dims | insert (output_dims.end(), data_dims.begin()+2, data_dims.end()) |
SetDoc (R"DOC( Batch gather operation, first dimension in DATA is the batch size. Given DATA tensor of rank r >= 2, and INDICES tensor of rank q >= 1, gather entries of the outer-most dimension of DATA indexed by INDICES, and concatenate them in an output tensor of rank (q - 1) + (r - 1). Example: DATA = [ [1.0, 1.2, 2.4, 4.5], [2.3, 3.4, 3.6, 2.3], [4.5, 5.7, 1.2, 4.5], ] INDICES = [ [0, 2], ] OUTPUT = [ [1.0, 2.4], [2.3, 3.6], [4.5, 1.2], ] )DOC").Input(0 | |
Tensor of rank of any rank q | Output (0,"OUTPUT","Tensor of rank (q - 1) + (r - 1).") |
OPERATOR_SCHEMA (BatchGatherGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (BatchGather, GetBatchGatherGradient) | |
REGISTER_CPU_OPERATOR (BatchMatMul, BatchMatMulOp< CPUContext >) | |
vector< TensorShape > | TensorInferenceForBatchMatMul (const OperatorDef &def, const vector< TensorShape > &in) |
OpSchema::Cost | CostInferenceForBatchMatMul (const OperatorDef &def, const vector< TensorShape > &in) |
where A has | shape (dim0, dim1,...M, K) |
where A has B has | shape (dim0, dim1,...K, N) |
REGISTER_CPU_OPERATOR (BatchSparseToDense, BatchSparseToDenseOp< float, CPUContext >) | |
NumInputs (3, 4).NumOutputs(1).SetDoc(R"DOC( Convert sparse matrix representation into dense matrix. A sparse matrix is represented by `lengths` vector | |
indices and values vector Each element in lengths | vector (lengths[`i`]) represents the number of indices in this batch (batch `i`). Within each batch |
REGISTER_CPU_OPERATOR (BooleanMask, BooleanMaskOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (BooleanMaskLengths, BooleanMaskLengthsOp< CPUContext >) | |
SetDoc (R"DOC( Given a data tensor and a 1D boolean mask tensor, returns a tensor containing only the elements corresponding to positions where the mask is true. )DOC").Input(0 | |
original data tensor | Input (1,"mask","A tensor of bools of same shape as `data`.").Output(0 |
original data tensor A tensor of same type as data | Output (1,"masked_indices","A tensor for indices.") |
return the segment lengths of a corresponding segmented tensor after BooleanMask is applied DOC | Input (0,"lengths","A 1D int32 tensor representing segment lengths.").Input(1 |
return the segment lengths of a corresponding segmented tensor after BooleanMask is applied DOC A bool tensor of values to keep | Output (0,"masked_lengths","Segment lengths of a masked tensor.") |
NO_GRADIENT (BooleanMaskLengths) | |
template<typename Functor > | |
void | MaskWithFunctor (size_t N, size_t M, int B, const float *in, Functor fn, float fill_val, float *out) |
template<typename Functor > | |
void | RepeatedMaskWithFunctor (size_t N, size_t M, int D, const float *in, Functor fn, float fill_val, float *out) |
REGISTER_CPU_OPERATOR (SequenceMask, SequenceMaskOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (BooleanUnmask, BooleanUnmaskOp< CPUContext >) | |
NumInputs ([](int n){return n > 0 &&n%2==0;}).NumOutputs(1).SetDoc(R"DOC( Given a series of mask and values | |
reconstruct values together according to masks A comprehensive False False True Reconstruct We Note that for all mask there must be at least one True If for a field there are multiple True we will accept the first value For False False False True DOC | Output (0,"unmasked_data","The final reconstructed unmasked data") |
REGISTER_CPU_OPERATOR (Cast, CastOp< CPUContext >) | |
out | push_back (in[0]) |
out[0] | set_data_type (cast::GetCastDataType(helper,"to")) |
SetDoc (R"DOC( The operator casts the elements of a given input tensor to a data type specified by the 'to' argument and returns an output tensor of the same size in the converted type. The 'to' argument must be one of the data types specified in the 'DataType' enum field in the TensorProto message. If the 'to' argument is not provided or is not one of the enumerated types in DataType, Caffe2 throws an Enforce error. NOTE: Casting to and from strings is not supported yet. )DOC").Arg("to" | |
The data type to which the elements of the input tensor are cast Strictly must be one of the types from DataType enum in TensorProto | Input (0,"input","Input tensor to be cast.").Output(0 |
REGISTER_GRADIENT (Cast, GetCastGradient) | |
REGISTER_CPU_OPERATOR (Ceil, CeilOp< float, CPUContext >) | |
SetDoc (R"DOC( Ceil takes one input data (Tensor<T>) and produces one output data (Tensor<T>) where the ceil function, y = ceil(x), is applied to the tensor elementwise. Currently supports only float32. )DOC").Input(0 | |
ND input tensor | Output (0,"Y","ND input tensor") |
GRADIENT_NOT_IMPLEMENTED_YET (Ceil) | |
REGISTER_CPU_OPERATOR (ChannelBackpropStats, ChannelBackpropStatsOp< CPUContext >) | |
NumInputs (4).NumOutputs(2).SetDoc(R"DOC( Given an input tensor in NCHW format | |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC | Input (0,"X","The input 4-dimensional tensor of shape NCHW").Input(1 |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC The mean saved from the forward pass as a dimensional tensor of size C | Input (2,"inv_std","The saved inverse standard deviation as a 1-dimensional tensor ""of size C.").Input(3 |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC The mean saved from the forward pass as a dimensional tensor of size C Gradient for the output layer of here used as input because we are on the backward pass | Output (0,"scale_grad","Gradient for the scale vector").Output(1 |
SHOULD_NOT_DO_GRADIENT (ChannelBackpropStats) | |
REGISTER_CPU_OPERATOR (ChannelShuffle, ChannelShuffleOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ChannelShuffleGradient, ChannelShuffleGradientOp< CPUContext >) | |
REGISTER_GRADIENT (ChannelShuffle, GetChannelShuffleGradient) | |
REGISTER_CPU_OPERATOR (ChannelStats, ChannelStatsOp< CPUContext >) | |
computes the sum of all elements per channel and the sum of all elements squared per channel These values can be reduced across multiple batches and used to obtain the mean and variance across the full set of batches Using the new mean and variance as input to SpatialBN has the effect of changing the batch size over which SpatialBN is applied DOC The output dimensional tensor of size C containing the sum of elements of X per channel | Output (1,"sumsq","The output 1-dimensional tensor of size C containing the sum of ""elements squared per channel.") |
SHOULD_NOT_DO_GRADIENT (ChannelStats) | |
REGISTER_CPU_OPERATOR (Clip, ClipOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ClipGradient, ClipGradientOp< float, CPUContext >) | |
Key value handler for | rendezvous (optional).") .Output(0 |
Key value handler for A common world for collective operations | Arg ("size","(int) size of the common world.").Arg("rank" |
Existing common world to clone | Output (0,"comm_world","A common world for collective operations.") |
SetDoc ("Closes all connections managed by a common world.").Input(0 | |
NumInputsOutputs ([](int in, int out){return in >=2 &&out==(in-1);}).EnforceInplace([](int in | |
InputsCanCrossDevices ().IdenticalTypeAndShapeOfInput(0).SetDoc(R"DOC( Does a broadcast operation from the root node to every other node. The tensor on each node should have been pre-created with the same shape and data type. )DOC").Input(0 | |
The common world | Input (1,"X","A tensor to be broadcasted.").Output(0 |
The common world In place as input | Arg ("root","(int, default 0) the root to run broadcast from.") |
The common world | Input (1,"X","A tensor to be reduced.").Output(0 |
The common world The reduced result on not set for other nodes | Arg ("root","(int, default 0) the root to run reduce into.") |
IdenticalTypeAndShapeOfInput (0).InputsCanCrossDevices().SetDoc(R"DOC( Does an allreduce operation among the nodes. Currently only Sum is supported. )DOC").Input(0 | |
The common world | Input (1,"X","A tensor to be allreduced.").Output(0 |
The common world | Input (1,"X","A tensor to be reduce-scattered.").Output(0 |
NumInputs (2, INT_MAX).NumOutputs(1).InputsCanCrossDevices().SetDoc(R"DOC( Does an allgather operation among the nodes. )DOC").Input(0 | |
The common world | Input (1,"X","A tensor to be allgathered.").Output(0 |
NumInputs ({2, 4}).NumOutputs(0).SetDoc(R"DOC( Sends the tensor to another node. )DOC").Input(0 | |
The common world An int CPUtensor of size specifying the rank If this overrides the to argument of the op | Input (3,"tag","An int CPUtensor of size 1 specifying the tag to ""send the tensor with. This overrides the 'tag' ""argument of the op.").Arg("dst" |
The common world An int CPUtensor of size specifying the rank If this overrides the to argument of the op The rank to send the tensor to | Arg ("tag","(int) a tag to send the tensor with.").Arg("raw_buffer" |
AllowInplace ({{2, 1},{3, 2}}).SetDoc(R"DOC( Receives the tensor from another node. )DOC").Input(0 | |
The common world | Input (1,"Y","In-place output. If raw_buffer is specified, ""Y should have pre-allocated data and type.").Input(2 |
The common world An int CPUtensor of size specifying the rank If this overrides the from argument of the op The received tensor | Output (1,"src","The sender that sent the message as a CPUTensor ""of size 1 and of type int.").Output(2 |
The common world An int CPUtensor of size specifying the rank If this overrides the from argument of the op The received tensor The tag that the message is sent with as a CPUTensor of size and of type int | Arg ("src","(int) the rank to receive the tensor from.").Arg("tag" |
The common world An int CPUtensor of size specifying the rank If this overrides the from argument of the op The received tensor The tag that the message is sent with as a CPUTensor of size and of type int int a tag to receive the tensor with | Arg ("raw_buffer","(bool) if set, only send the content and assume that the receiver ""has already known the tensor's shape and information.") |
SHOULD_NOT_DO_GRADIENT (CreateCommonWorld) | |
SHOULD_NOT_DO_GRADIENT (CloneCommonWorld) | |
SHOULD_NOT_DO_GRADIENT (DestroyCommonWorld) | |
SHOULD_NOT_DO_GRADIENT (Broadcast) | |
SHOULD_NOT_DO_GRADIENT (Reduce) | |
SHOULD_NOT_DO_GRADIENT (Allgather) | |
SHOULD_NOT_DO_GRADIENT (Allreduce) | |
SHOULD_NOT_DO_GRADIENT (ReduceScatter) | |
SHOULD_NOT_DO_GRADIENT (Barrier) | |
SHOULD_NOT_DO_GRADIENT (SendTensor) | |
SHOULD_NOT_DO_GRADIENT (ReceiveTensor) | |
REGISTER_CPU_OPERATOR (CreateCommonWorld, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (CloneCommonWorld, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (DestroyCommonWorld, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Broadcast, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Reduce, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Allgather, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Allreduce, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ReduceScatter, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Barrier, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SendTensor, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ReceiveTensor, NoDefaultEngineOp< CPUContext >) | |
REGISTER_CUDA_OPERATOR (CreateCommonWorld, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CloneCommonWorld, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Broadcast, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Reduce, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Allgather, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Allreduce, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (SendTensor, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ReceiveTensor, NoDefaultEngineOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Split, SplitOp< CPUContext >) | |
INT_MAX | Input (0,"input","The tensor to split").Input(1 |
INT_MAX Optional list of output | lengths (see also arg 'split')") .Arg("axis" |
INT_MAX Optional list of output Which axis to split on | Arg ("split","length of each output").Arg("order" |
INT_MAX Optional list of output Which axis to split on Either NHWC or will split on C defaults to NCHW | DeviceInferenceFunction (splitOpDevInfer).SetDoc(R"DOC( Split a tensor into a list of tensors |
REGISTER_CPU_OPERATOR (Concat, ConcatOp< CPUContext >) | |
NumInputs (1, INT_MAX).NumOutputs(2).Arg("axis" | |
Which axis to concat on | Arg ("order","Either NHWC or NCHW, will concat on C axis, defaults to NCHW").Arg("add_axis" |
Which axis to concat on Pass to add the axis specified in arg axis to all input tensors | TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper helper(def);const int axis=helper.HasArgument("axis")?helper.GetSingleArgument< int >("axis",-1):GetDimFromOrderString(helper.GetSingleArgument< string >("order","NCHW"));bool add_axis=helper.GetSingleArgument< int >("add_axis", 0)!=0;const int canonical_axis=canonical_axis_index_(axis, in[0].dims_size());CAFFE_ENFORCE_GT(in.size(), 0);vector< int > split_shape(1, in.size());vector< int > out_shape(in[0].dims().begin(), in[0].dims().end());if(add_axis){for(int i=1;i< in.size();++i){CAFFE_ENFORCE_EQ(in[0].dims().size(), in[i].dims().size(),"All inputs of Concat should have same dims when add_axis = 1. ""Got different sizes for inputs 0 and ", i);for(int j=0;j< in[0].dims().size();++j){CAFFE_ENFORCE_EQ(in[0].dims(j), in[i].dims(j),"All inputs of Concat should have same dims when add_axis = 1. ""Got different dims for inputs 0 and ", i,". At dim: ", j);}}out_shape.insert(out_shape.begin()+canonical_axis, in.size());}else{for(int i=1;i< in.size();++i){CAFFE_ENFORCE_EQ(in[0].dims().size(), in[i].dims().size(),"All inputs of Concat should have same dims except ""canonical_axis dim that is equal to ", canonical_axis,"Got different sizes for inputs 0 and ", i);for(int j=0;j< in[0].dims().size();++j){if(j==canonical_axis){continue;}CAFFE_ENFORCE_EQ(in[0].dims(j), in[i].dims(j),"All inputs of Concat should have same dims except ""canonical_axis dim that is equal to ", canonical_axis,"Got different dims for inputs 0 and ", i,". 
At dim: ", j);}}for(int i=1;i< in.size();++i){out_shape[canonical_axis]+=in[i].dims(canonical_axis);}}if(def.output_size()==1){return vector< TensorShape >{CreateTensorShape(out_shape, in[0].data_type())};}return vector< TensorShape >{CreateTensorShape(out_shape, in[0].data_type()), CreateTensorShape(split_shape, TensorProto::INT32)};}).CostInferenceFunction(CostInferenceForConcat).DeviceInferenceFunction(concatOpDevInfer).SetDoc("Concatenate a list of tensors into a single tensor").Output(0 |
Which axis to concat on Pass to add the axis specified in arg axis to all input tensors Concatenated tensor | Output (1,"split_info","The dimensions of the inputs.").InheritOnnxSchema("Concat") |
REGISTER_CPU_OPERATOR (DepthSplit, SplitOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (DepthConcat, ConcatOp< CPUContext >) | |
INT_MAX | SetDoc ("Backward compatible operator name for Split.") |
REGISTER_GRADIENT (Split, GetSplitGradient) | |
REGISTER_GRADIENT (DepthSplit, GetSplitGradient) | |
REGISTER_GRADIENT (Concat, GetConcatGradient) | |
REGISTER_GRADIENT (DepthConcat, GetConcatGradient) | |
REGISTER_CUDA_OPERATOR (Split, SplitOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Concat, ConcatOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (DepthSplit, SplitOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (DepthConcat, ConcatOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Conditional, ConditionalOp< CPUContext >) | |
NumInputs (3).NumOutputs(1).SetDoc(R"DOC( Given a 1-D tensor of boolean values | |
apply conditional | operator along the first dimension of DataT and DataF and return DataO. Note, DataT and DataF must have the exact same shape and type. )DOC") .Input (0,"Condition","Boolean tensor to select DataT or DataF").Input(1 |
apply conditional Data to use when True | Input (2,"DataF","Data to use when False").Output(0 |
NO_GRADIENT (Conditional) | |
REGISTER_CPU_OPERATOR (ConvGradient, ConvGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (ConvGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (Conv1DGradient, ConvGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (Conv1DGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (Conv2DGradient, ConvGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (Conv2DGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (Conv3DGradient, ConvGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (Conv3DGradient).NumInputs(2 | |
REGISTER_GRADIENT (Conv, GetConvGradient) | |
REGISTER_GRADIENT (Conv1D, GetConvGradient) | |
REGISTER_GRADIENT (Conv2D, GetConvGradient) | |
REGISTER_GRADIENT (Conv3D, GetConvGradient) | |
std::function< void(OpSchema &)> | ConvDocGenerator (const char *dim) |
REGISTER_CPU_OPERATOR (Conv, ConvOp< float, CPUContext >) | |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (Conv1D, ConvOp< float, CPUContext >) |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (Conv2D, ConvOp< float, CPUContext >) |
NumInputs(2, 3).NumOutputs(1).CostInferenceFunction(OpSchema | REGISTER_CPU_OPERATOR (Conv3D, ConvOp< float, CPUContext >) |
REGISTER_CUDNN_OPERATOR (Conv, CudnnConvOp) | |
REGISTER_CUDNN_OPERATOR (ConvGradient, CudnnConvGradientOp) | |
REGISTER_CUDNN_OPERATOR (Conv1D, CudnnConvOp) | |
REGISTER_CUDNN_OPERATOR (Conv1DGradient, CudnnConvGradientOp) | |
REGISTER_CUDNN_OPERATOR (Conv2D, CudnnConvOp) | |
REGISTER_CUDNN_OPERATOR (Conv2DGradient, CudnnConvGradientOp) | |
REGISTER_CUDNN_OPERATOR (Conv3D, CudnnConvOp) | |
REGISTER_CUDNN_OPERATOR (Conv3DGradient, CudnnConvGradientOp) | |
REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, EIGEN, EigenConvOp< float >) | |
REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv1D, EIGEN, EigenConvOp< float >) | |
REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv2D, EIGEN, EigenConvOp< float >) | |
REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv3D, EIGEN, EigenConvOp< float >) | |
REGISTER_CUDA_OPERATOR (Conv, ConvOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ConvGradient, ConvGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv1D, ConvOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv1DGradient, ConvGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv2D, ConvOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv2DGradient, ConvGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv3D, ConvOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Conv3DGradient, ConvGradientOp< float, CUDAContext >) | |
template<> | |
void | createSharedBuffer< CPUContext > (Workspace *ws) |
template<> | |
void | runWithSharedBuffer (Workspace *ws, std::function< void(Tensor< CPUContext > *buffer)> f) |
template<typename Context > | |
void | createSharedBuffer (Workspace *ws) |
Creates a mutex and shared buffer in the workspace. More... | |
template<typename Context > | |
void | runWithSharedBuffer (Workspace *ws, std::function< void(Tensor< Context > *buffer)> f) |
Thread-safe, can be invoked from RunOnDevice() to serialize access to shared buffer. | |
template<> | |
void | createSharedBuffer< CUDAContext > (Workspace *ws) |
template<> | |
void | runWithSharedBuffer (Workspace *ws, std::function< void(Tensor< CUDAContext > *buffer)> f) |
REGISTER_CPU_OPERATOR (ConvTransposeGradient, ConvTransposeGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (ConvTransposeGradient).NumInputs(3).NumOutputs(1 | |
REGISTER_GRADIENT (ConvTranspose, GetConvTransposeGradient) | |
REGISTER_CPU_OPERATOR (ConvTranspose, ConvTransposeOp< float, CPUContext >) | |
NumInputs (2, 3).NumOutputs(1).SetDoc(R"DOC( The transposed convolution consumes an input vector | |
this is done throughout the image data and the output is computed As a side note on the implementation which is why they are separate files DOC | Input (0,"X","Input data blob from previous layer; has size ""(N x C x H x W), where N is the batch size, C is the number of channels, and"" H and W are the height and width. Note that this is for the NCHW usage. On ""the other hand, the NHWC Op has a different set of dimension constraints.").Input(1 |
has | size (M x C x kH x kW) |
has where C is the number of and kH and kW are the height and width of the kernel | Input (2,"bias","The 1D bias blob that is added through the convolution;""has size (C). Optional, if not passed, will treat it as all 0.").Output(0 |
has where C is the number of and kH and kW are the height and width of the kernel Output data blob that contains the result of the transposed convolution The output dimensions are functions of the kernel stride and pad lengths | InheritOnnxSchema ("ConvTranspose") |
REGISTER_CUDNN_OPERATOR (ConvTranspose, CudnnConvTransposeOp< float >) | |
REGISTER_CUDNN_OPERATOR (ConvTransposeGradient, CudnnConvTransposeGradientOp< float >) | |
REGISTER_CUDA_OPERATOR (ConvTranspose, ConvTransposeOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ConvTransposeGradient, ConvTransposeGradientOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (Cos, UnaryElementwiseOp< TensorTypes< float >, CPUContext, CosCPUFunctor >) | |
REGISTER_CPU_OPERATOR (CosGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< CosGradientCPUFunctor >>) | |
OPERATOR_SCHEMA (CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape() | |
REGISTER_GRADIENT (Cos, GetCosGradient) | |
REGISTER_CPU_OPERATOR (CosineEmbeddingCriterion, CosineEmbeddingCriterionOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (CosineEmbeddingCriterionGradient, CosineEmbeddingCriterionGradientOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (CreateCounter, CreateCounterOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (ResetCounter, ResetCounterOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (CountDown, CountDownOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (CheckCounterDone, CheckCounterDoneOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (CountUp, CountUpOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (RetrieveCount, RetrieveCountOp< int64_t, CPUContext >) | |
A blob pointing to an instance of a new counter | Arg ("init_count","Initial count for the counter, must be >= 0.") |
SetDoc (R"DOC( Resets a count-down counter with initial value specified by the 'init_count' argument. )DOC").Input(0 | |
A blob pointing to an instance of a new counter | Output (0,"previous_value","(optional) Previous value of the counter.").Arg("init_count" |
REGISTER_CUDA_OPERATOR (CreateCounter, CreateCounterOp< int64_t, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ResetCounter, ResetCounterOp< int64_t, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CountDown, CountDownOp< int64_t, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CheckCounterDone, CheckCounterDoneOp< int64_t, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CountUp, CountUpOp< int64_t, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (RetrieveCount, RetrieveCountOp< int64_t, CUDAContext >) | |
CAFFE_KNOWN_TYPE (detail::WorkspaceStack) | |
REGISTER_CPU_OPERATOR (CreateScope, CreateScopeOp< CPUContext >) | |
SHOULD_NOT_DO_GRADIENT (CreateScope) | |
OPERATOR_SCHEMA (CreateScope).NumInputs(0).NumOutputs(1).SetDoc(R"DOC( 'CreateScope' operator initializes and outputs empty scope that is used by Do operator to store local blobs )DOC") | |
REGISTER_CPU_OPERATOR (HasScope, HasScopeOp< CPUContext >) | |
SHOULD_NOT_DO_GRADIENT (HasScope) | |
OPERATOR_SCHEMA (HasScope).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Checks whether scope blob has any saved scopes left )DOC") | |
REGISTER_CPU_OPERATOR (LabelCrossEntropy, LabelCrossEntropyOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LabelCrossEntropyGradient, LabelCrossEntropyGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Operator computes the cross entropy between the input and the label set. In practice, it is most commonly used at the end of models, after the SoftMax operator and before the AveragedLoss operator. Note that LabelCrossEntropy assumes that the label provided is either a 1D array of size N (batch size), or a 2D array of size N x 1 (batch size). Each entry in the label vector indicates which is the correct class; as such, each entry must be between 0 and D - 1, inclusive, where D is the total number of classes. The formula used is: Y[i] = -log(X[i][j]) where (i, j) is the classifier's prediction of the jth class (the correct one), and i is the batch size. Each log has a lower limit for numerical stability. )DOC").Input(0 | |
X is a array of size N x where N is the batch size and D is the number of classes | Input (1,"label","Blob containing the labels used to compare the input").Output(0 |
REGISTER_GRADIENT (LabelCrossEntropy, GetLabelCrossEntropyGradient) | |
REGISTER_CPU_OPERATOR (MakeTwoClass, MakeTwoClassOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (MakeTwoClassGradient, MakeTwoClassGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SigmoidCrossEntropyWithLogits, SigmoidCrossEntropyWithLogitsOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SigmoidCrossEntropyWithLogitsGradient, SigmoidCrossEntropyWithLogitsGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (WeightedSigmoidCrossEntropyWithLogits, WeightedSigmoidCrossEntropyWithLogitsOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (WeightedSigmoidCrossEntropyWithLogitsGradient, WeightedSigmoidCrossEntropyWithLogitsGradientOp< float, CPUContext >) | |
out[0] | add_dims (in[0].dims(0)) |
out[0] | add_dims (2) |
SetDoc (R"DOC( Given a vector of probabilities, this operator transforms this into a 2-column matrix with complementary probabilities for binary classification. In explicit terms, given the vector X, the output Y is vstack(1 - X, X). )DOC").Input(0 |
Input vector of probabilities | Output (0,"Y","2-column matrix with complementary probabilities of X for ""binary classification") |
SetDoc (R"DOC( Given two matrices logits and targets, of same shape, (batch_size, num_classes), computes the sigmoid cross entropy between the two. Returns a tensor of shape (batch_size,) of losses for each example. )DOC").Input(0 | |
matrix of logits for each example and class | Input (1,"targets","matrix of targets, same shape as logits.").Output(0 |
SetDoc (R"DOC( Given three matrices: logits, targets, weights, all of the same shape, (batch_size, num_classes), computes the weighted sigmoid cross entropy between logits and targets. Specifically, at each position r,c, this computes weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then averages over each row. Returns a tensor of shape (batch_size,) of losses for each example. )DOC").Input(0 | |
matrix of logits for each example and class matrix of same shape as logits | Output (0,"xentropy","Vector with the total xentropy for each example.") |
REGISTER_GRADIENT (MakeTwoClass, GetMakeTwoClassGradient) | |
REGISTER_GRADIENT (SigmoidCrossEntropyWithLogits, GetSigmoidCrossEntropyWithLogitsGradient) | |
REGISTER_GRADIENT (WeightedSigmoidCrossEntropyWithLogits, GetWeightedSigmoidCrossEntropyWithLogitsGradient) | |
REGISTER_CPU_OPERATOR (CrossEntropy, CrossEntropyOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (CrossEntropyGradient, CrossEntropyGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Operator computes the cross entropy between the input and the label set. In practice, it is most commonly used at the end of models, after the SoftMax operator and before the AveragedLoss operator. Note that CrossEntropy assumes that the soft labels provided is a 2D array of size N x D (batch size x number of classes). Each entry in the 2D label corresponds to the soft label for the input, where each element represents the correct probability of the class being selected. As such, each element must be between 0 and 1, and all elements in an entry must sum to 1. The formula used is: Y[i] = sum_j (label[i][j] * log(X[i][j])) where (i, j) is the classifier's prediction of the jth class (the correct one), and i is the batch size. Each log has a lower limit for numerical stability. )DOC").Input(0 | |
REGISTER_GRADIENT (CrossEntropy, GetCrossEntropyGradient) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< dataset_ops::TreeCursor >) | |
CAFFE_KNOWN_TYPE (dataset_ops::TensorVectorPtr< CPUContext >) | |
CAFFE_KNOWN_TYPE (dataset_ops::SharedTensorVectorPtr) | |
OPERATOR_SCHEMA (DeformConvGradient).NumInputs(4 | |
NumOutputs (2, 4) | |
OpSchema::Cost | CostInferenceForDotProduct (const OperatorDef &def, const vector< TensorShape > &in) |
REGISTER_CPU_OPERATOR (SquaredL2Distance, SquaredL2DistanceOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SquaredL2DistanceGradient, SquaredL2DistanceGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Given two input float tensors X, Y, and produces one output float tensor of the L2 difference between X and Y that is computed as ||X - Y||^2 / 2. )DOC").Input(0 |
or input tensor | Input (1,"Y","1D or 2D input tensor (must have the same shape as X)").Output(0 |
OPERATOR_SCHEMA (SquaredL2DistanceGradient).NumInputs(3).NumOutputs(2) | |
REGISTER_GRADIENT (SquaredL2Distance, GetSquaredL2DistanceGradient) | |
REGISTER_CPU_OPERATOR (L1Distance, L1DistanceOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (L1DistanceGradient, L1DistanceGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Given two input float tensors X, Y, and produces one output float tensor of the L1 difference between X and Y, computed as L1(x,y) = sum over |x-y| )DOC").Input(0 | |
OPERATOR_SCHEMA (L1DistanceGradient).NumInputs(3).NumOutputs(2) | |
REGISTER_GRADIENT (L1Distance, GetL1DistanceGradient) | |
REGISTER_CPU_OPERATOR (DotProduct, DotProductOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (DotProductGradient, DotProductGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Given two input float tensors X, Y, and produces one output float tensor of the dot product between X and Y. )DOC").Input(0 | |
or input tensor output tensor | CostInferenceFunction (OpSchema::CostInferenceFunctionType(CostInferenceForDotProduct)) |
OPERATOR_SCHEMA (DotProductGradient).NumInputs(3).NumOutputs(2) | |
REGISTER_GRADIENT (DotProduct, GetDotProductGradient) | |
REGISTER_CPU_OPERATOR (CosineSimilarity, CosineSimilarityOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (CosineSimilarityGradient, CosineSimilarityGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Given two input float tensors X, Y, and produces one output float tensor of the cosine similarity between X and Y. )DOC").Input(0 | |
OPERATOR_SCHEMA (CosineSimilarityGradient).NumInputs(3).NumOutputs(2) | |
REGISTER_GRADIENT (CosineSimilarity, GetCosineSimilarityGradient) | |
REGISTER_CPU_OPERATOR (DotProductWithPadding, DotProductWithPaddingOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (DotProductWithPaddingGradient, DotProductWithPaddingGradientOp< float, CPUContext >) | |
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller | tensor (using pad_value) to the same shape as the other one.2) replicate the smaller tensor to the same shape as the other one.Note the first dimension of X |
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller Y must be equal Only the second dimension of X or Y can be padded DOC | Input (0,"X","1D or 2D input tensor").Input(1 |
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller Y must be equal Only the second dimension of X or Y can be padded DOC or input tensor | Output (0,"Z","1D output tensor").IdenticalTypeAndShapeOfInputDim(0 |
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller Y must be equal Only the second dimension of X or Y can be padded DOC or input tensor | Arg ("pad_value","the padding value for tensors with smaller dimension").Arg("replicate" |
OPERATOR_SCHEMA (DotProductWithPaddingGradient).NumInputs(3).NumOutputs(2) | |
REGISTER_GRADIENT (DotProductWithPadding, GetDotProductWithPaddingGradient) | |
REGISTER_CPU_OPERATOR (Do, DoOp< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( 'Do' control operator, executes a subnet in a separate workspace. Last blobs in the input and output lists should be the same blob created with CreateScope op. Arguments 'inner_blobs' and 'outer_blobs_idx' provide a mapping between selected inner blob names and corresponding outer blob indices. )DOC").Arg("net" |
INT_MAX Subnet with blob bindings | Arg ("inner_blobs","List of inner net blob names to bind to outer workspace").Arg("outer_blobs_idx" |
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in | operator outputs (skipping workspace blobs)") .Arg( "saved_fwd_blobs" |
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in List of blobs from the forward Do | operator workspace needed" "in backward pass, used in gradient Do operator") .Arg ("reuse_workspace","Whether to reuse workspace or create a new one in a given scope").AllowInplace([](int in |
REGISTER_CUDA_OPERATOR (Do, DoOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Dropout, DropoutOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (DropoutGrad, DropoutGradientOp< float, CPUContext >) | |
AllowInplace ({{0, 0}}).TensorInferenceFunction([](const OperatorDef &def | |
if (output_mask) | |
SetDoc (R"DOC( Dropout takes one input data (Tensor<float>) and produces two Tensor outputs, output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in test mode or not, the output Y will either be a random dropout, or a simple copy of the input. Note that our implementation of Dropout does scaling in the training phase, so during testing nothing needs to be done. )DOC").Arg("ratio" | |
default the ratio of random dropout | ArgIsTest ("(int) if nonzero, run dropout in test mode where ""the output is simply Y = X.").Input(0 |
default the ratio of random dropout The input data as Tensor | Output (0,"output","The output.").Output(1 |
default the ratio of random dropout The input data as Tensor The output mask If is_test is this output is not filled | InheritOnnxSchema ("Dropout") |
REGISTER_GRADIENT (Dropout, GetDropoutGradient) | |
EIGEN_FUNCTOR (Add, EIGEN_ADD, NumericTypes, SameTypeAsInput) | |
EIGEN_FUNCTOR (Div, EIGEN_DIV, NumericTypes, SameTypeAsInput) | |
void | ElementWiseDivide (CPUContext &, const int n, float *dXdata, float *dYdata, const float *dZdata, const float *Ydata, const float *Zdata) |
REGISTER_CPU_OPERATOR (DivGradient, DivGradientOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ElementwiseLinear, ElementwiseLinearOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ElementwiseLinearGradient, ElementwiseLinearGradientOp< float, CPUContext >) | |
w of size D and b of size the op computes Y of | size (N X D) where Y_ |
REGISTER_GRADIENT (ElementwiseLinear, GetElementwiseLinearGradient) | |
EIGEN_FUNCTOR (Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput) | |
NAIVE_FUNCTOR (LT, NAIVE_LT, NumericTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (LE, NAIVE_LE, NumericTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (GT, NAIVE_GT, NumericTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (GE, NAIVE_GE, NumericTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (EQ, NAIVE_EQ, IntBoolTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (And, NAIVE_AND, BoolTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (Or, NAIVE_OR, BoolTypes, FixedType< bool >) | |
NAIVE_FUNCTOR (Xor, NAIVE_XOR, BoolTypes, FixedType< bool >) | |
REGISTER_CPU_OPERATOR (Not, UnaryElementwiseOp< BoolTypes, CPUContext, NotFunctor >) | |
REGISTER_CPU_OPERATOR (SumReduceLike, SumReduceLikeOp< CPUContext >) | |
template<typename Context > | |
std::tuple< size_t, size_t, size_t > | calculate_broadcast_sizes (const Tensor< Context > &A, const Tensor< Context > &B, int axis) |
std::function< void(OpSchema &)> | MathDocGenerator (const char *name) |
CostInferenceFunction (PointwiseCostInference< 1 >).IdenticalTypeAndShapeOfInput(0).FillUsing(MathDocGenerator("addition")).InheritOnnxSchema("Add") | |
OPERATOR_SCHEMA (DivGradient).NumInputs(3).NumOutputs(2).AllowInplace( | |
and the dimensions of the second input is the contiguous subset of the dimensions of the first For the following tensor shapes are | shape (B) |
and the dimensions of the second input is the contiguous subset of the dimensions of the first For the following tensor shapes are i e B is a scalar | shape (A) |
EIGEN_FUNCTOR (Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput) | |
REGISTER_CPU_OPERATOR (Sum, SumOp< CPUContext >) | |
CostInferenceFunction (CostInferenceForSum).InputsCanCrossDevices().IdenticalTypeAndShapeOfInput(0).SetDoc(R"DOC( Element-wise sum of each of the input tensors. The first input tensor can be used in-place as the output tensor | |
in which case the sum will be done in place and results will be accumulated in input0 All inputs and outputs must have the same shape and data type DOC | Input (0,"data_0","First of the input tensors. Can be inplace.").Output(0 |
in which case the sum will be done in place and results will be accumulated in input0 All inputs and outputs must have the same shape and data type DOC Output tensor Same dimension as inputs | InheritOnnxSchema ("Sum") |
REGISTER_CPU_OPERATOR (Elu, EluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (EluGradient, EluGradientOp< float, CPUContext >) | |
is applied to the tensor elementwise DOC | Input (0,"X","1D input tensor").Output(0 |
is applied to the tensor elementwise DOC input tensor | InheritOnnxSchema ("Elu") |
SetDoc (R"DOC( EluGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the rectified linear function. )DOC") | |
REGISTER_GRADIENT (Elu, GetEluGradient) | |
REGISTER_CPU_OPERATOR (Exp, UnaryElementwiseOp< TensorTypes< float >, CPUContext, ExpCPUFunctor >) | |
element wise This operation can be done in an in place fashion by providing the same input and output blobs DOC The exponential of the input tensor computed element wise | InheritOnnxSchema ("Exp") |
REGISTER_GRADIENT (Exp, GetExpGradient) | |
REGISTER_CPU_OPERATOR (ExpandDims, ExpandDimsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Squeeze, SqueezeOp< CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper helper(def);auto dims=helper.template GetRepeatedArgument< int >("dims");auto originalSize=dims.size();CAFFE_ENFORCE(originalSize > 0,"Parameter `dims` must be provided.");std::sort(dims.begin(), dims.end());dims.erase(std::unique(dims.begin(), dims.end()), dims.end());if(dims.size()< originalSize){LOG(WARNING)<< "Parameter `dims` has repeated dimensions.";}CAFFE_ENFORCE(dims.front() >=0,"Dimension ids must be non-negative.");CAFFE_ENFORCE_GE(in[0].dims_size()+dims.size(), dims.back()+1,"Input needs at least ",(1+dims.back()-dims.size())," dimensions given `dims`.");vector< TensorShape > out(1);int cur_pos=0;int idx=0;for(const auto new_dim:dims){for(int i=cur_pos;i< new_dim;i++){out[0].add_dims(in[0].dims(idx++));}out[0].add_dims(1);cur_pos=new_dim+1;}for(;idx< in[0].dims_size();idx++){out[0].add_dims(in[0].dims(idx));}out[0].set_data_type(in[0].data_type());return out;}).SetDoc(R"DOC( Insert single-dimensional entries to the shape of a tensor. Takes one required argument `dims` | |
REGISTER_CUDA_OPERATOR (Squeeze, SqueezeOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ExpandDims, ExpandDimsOp< CUDAContext >) | |
std::vector< TensorShape > | FCShapeInference (const OperatorDef &def, const vector< TensorShape > &in, bool pretransposed_weight) |
OpSchema::Cost | CostInferenceForFC (const OperatorDef &def, const vector< TensorShape > &in) |
REGISTER_CPU_OPERATOR (FeedBlob, FeedBlobOp< CPUContext >) | |
SHOULD_NOT_DO_GRADIENT (FeedBlob) | |
NumInputs (0, 0).NumOutputs(1 | |
SetDoc (R"DOC( FeedBlobs the content of the blobs. The input and output blobs should be one-to-one inplace.)DOC").Arg("value" | |
REGISTER_CPU_OPERATOR (UniformFill, UniformFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (UniformIntFill, UniformFillOp< int, CPUContext >) | |
REGISTER_CPU_OPERATOR (UniqueUniformFill, UniqueUniformFillOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ConstantFill, ConstantFillOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (DiagonalFill, DiagonalFillOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (GaussianFill, GaussianFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (XavierFill, XavierFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (MSRAFill, MSRAFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (RangeFill, RangeFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsRangeFill, LengthsRangeFillOp< CPUContext >) | |
TensorInferenceFunction (FillerTensorInference<>).SetDoc(R"DOC( The operator fills the elements of the output tensor with a const ant value specified by the 'value' argument. The data type is specified by the 'dtype' argument. The 'dtype' argument must be one of the data types specified in the 'DataType' enum field in the TensorProto message. If the 'dtype' argument is not provided | |
template<int VALUE_TYPE = TensorProto_DataType_FLOAT> | |
std::vector< TensorShape > | FillerTensorInference (const OperatorDef &def, const vector< TensorShape > &in) |
REGISTER_CUDA_OPERATOR (LengthsRangeFill, GPUFallbackOp< LengthsRangeFillOp< CPUContext >>) | |
Index (integers)") .Input(1 | |
Needles query | Output (0,"query_indices","Indices of the needles in index or 'missing value'").Arg("missing_value" |
Needles query Placeholder for items that are not found | SetDoc (R"DOC( Finds elements of second input from first input, outputting the last (max) index for each query. If a query is not found, missing_value is inserted. See IndexGet() for a version that modifies the index when values are not found. )DOC") |
REGISTER_CPU_OPERATOR (Flatten, FlattenOp< CPUContext >) | |
vector< TensorShape > | out (1) |
for (auto d:in[0].dims()) | |
out[0] | set_data_type (in[0].data_type()) |
out[0] | add_dims (outer) |
out[0] | add_dims (inner) |
SetDoc (R"DOC( Flattens the input tensor into a 2D matrix. If input tensor has shape (d_0, d_1, ... d_n) then the output will have shape (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn) )DOC").Input(0 | |
A tensor of with input dimensions up to axis flattened to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output | Arg ("axis","(Default to 1) Indicate up to which input dimensions ""(exclusive) should be flattened to the outer dimension of the output").InheritOnnxSchema("Flatten") |
REGISTER_GRADIENT (Flatten, GetFlattenGradient) | |
REGISTER_CPU_OPERATOR (FlexibleTopK, FlexibleTopKOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (FlexibleTopKGradient, FlexibleTopKGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (Floor, FloorOp< float, CPUContext >) | |
SetDoc (R"DOC( Floor takes one input data (Tensor<T>) and produces one output data (Tensor<T>) where the floor function, y = floor(x), is applied to the tensor elementwise. Currently supports only float32. )DOC").Input(0 | |
GRADIENT_NOT_IMPLEMENTED_YET (Floor) | |
REGISTER_CPU_OPERATOR (Free, FreeOp< CPUContext >) | |
SHOULD_NOT_DO_GRADIENT (Free) | |
INT_MAX | SameNumberOfOutput ().EnforceOneToOneInplace().SetDoc(R"DOC( Frees the content of the blobs. The input and output blobs should be one-to-one inplace.)DOC") |
REGISTER_CUDA_OPERATOR (Free, FreeOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (FC, FullyConnectedOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (FCGradient, FullyConnectedGradientOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (FCTransposed, FullyConnectedOp< CPUContext, DefaultEngine, false >) | |
REGISTER_CPU_OPERATOR (FCTransposedGradient, FullyConnectedGradientOp< CPUContext, DefaultEngine, false >) | |
REGISTER_CUDA_OPERATOR (FC, FullyConnectedOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (FCGradient, FullyConnectedGradientOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (FCTransposed, FullyConnectedOp< CUDAContext, DefaultEngine, false >) | |
REGISTER_CUDA_OPERATOR (FCTransposedGradient, FullyConnectedGradientOp< CUDAContext, DefaultEngine, false >) | |
REGISTER_CPU_OPERATOR (FloatToFused8BitRowwiseQuantized, FloatToFused8BitRowwiseQuantizedOp< CPUContext >) | |
and then scaling each element to an bit number between and To later de quantize the | scale (range/255) and offset(bias) are stored alongside the data.More precisely |
and then scaling each element to an bit number between and To later de quantize the the first bytes of each row in the output matrix are a bit float storing the the next bytes store the bias as a bit and all remaining bytes in the row encode single quantized values DOC | Input (0,"input","Float32 input data").Output(0 |
NO_GRADIENT (FloatToFused8BitRowwiseQuantized) | |
REGISTER_CPU_OPERATOR (Fused8BitRowwiseQuantizedToFloat, Fused8BitRowwiseQuantizedToFloatOp< CPUContext >) | |
followed by the bias as a bit float in the next and the quantized values in the preceding bytes of the row The output is a matrix containing only the but de quantized De quantization is performed by multiplying each value by its row s scale and bias parameters The de quantized values will thus not be exactly equal to the un quantized floating point values DOC | Input (0,"scale_bias_quantized_input","Fused scale, bias and quantized data").Output(0 |
NO_GRADIENT (Fused8BitRowwiseQuantizedToFloat) | |
but operating on bit rowwise quantized matrices with fused | storage (where each row stores quantized values, and then the scale and offset).DATA needs to have rank 2 and INDICES needs to have rank 1.) DOC") .Input( 0 |
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with | operator FloatToFused8BitRowwiseQuantized") .Input (1,"INDICES","Integer vector containing indices of the first dimension of DATA for""the rows that are being gathered").Output(0 |
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with output | TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out(1);for(auto d:in[1].dims()){out[0].add_dims(d);}for(int i=1;i< in[0].dims_size();++i){out[0].add_dims(in[0].dims(i));}out[0].set_data_type(in[0].data_type());return out;}) |
REGISTER_CPU_OPERATOR (GatherFused8BitRowwise, GatherFused8BitRowwiseOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorFill, GivenTensorFillOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorDoubleFill, GivenTensorFillOp< double, CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorBoolFill, GivenTensorFillOp< bool, CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorIntFill, GivenTensorFillOp< int, CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorInt64Fill, GivenTensorFillOp< int64_t, CPUContext >) | |
REGISTER_CPU_OPERATOR (GivenTensorStringFill, GivenTensorFillOp< std::string, CPUContext >) | |
NO_GRADIENT (GivenTensorFill) | |
NO_GRADIENT (GivenTensorDoubleFill) | |
NO_GRADIENT (GivenTensorBoolFill) | |
NO_GRADIENT (GivenTensorIntFill) | |
NO_GRADIENT (GivenTensorInt64Fill) | |
NO_GRADIENT (GivenTensorStringFill) | |
TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_DOUBLE >) | |
TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_BOOL >) | |
TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_INT32 >) | |
TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_INT64 >) | |
TensorInferenceFunction (FillerTensorInference< TensorProto_DataType_STRING >) | |
REGISTER_CPU_OPERATOR (GRUUnit, GRUUnitOp< float, CPUContext >) | |
in a sequence length aware fashion given | the (fused) inputs X(TxNxD) |
in a sequence length aware fashion given the previous hidden | state (NxD) |
in a sequence length aware fashion given the previous hidden and the sequence | lengths (N) |
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is | invalid (as in, the value at X[t][n] >=seqLengths[n].) DOC") .Arg( "drop_states" |
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is Bool to determine if hidden state is zeroes or passed along for timesteps past the given sequence_length | Arg ("sequence_lengths","When false, the sequence lengths input is left out, ""and all following inputs are shifted left by one.").Output(0 |
REGISTER_CPU_OPERATOR (GRUUnitGradient, GRUUnitGradientOp< float, CPUContext >) | |
NumInputs (5, 6).NumOutputs(2).Arg("sequence_lengths" | |
REGISTER_GRADIENT (GRUUnit, GetGRUUnitGradient) | |
out | push_back (X) |
out[0] | set_data_type (TensorProto_DataType_FLOAT16) |
out[0] | set_data_type (TensorProto_DataType_FLOAT) |
The value for the elements of the output tensor | Arg ("shape","The shape of the output tensor.").Output(0 |
REGISTER_GRADIENT (FloatToHalf, GetFloatToHalfGradient) | |
REGISTER_GRADIENT (HalfToFloat, GetHalfToFloatGradient) | |
NO_GRADIENT (Float16ConstantFill) | |
std::vector< TensorShape > | Float16FillerTensorInference (const OperatorDef &def, const vector< TensorShape > &in) |
REGISTER_CPU_OPERATOR (If, IfOp< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( 'If' control operator, first input is a scalar boolean blob that stores condition value. Accepts 'then_net' (required) and 'else_net' (optional) arguments for 'then' and 'else' subnets respectively. Subnets are executed in the same workspace as 'If'. )DOC").Arg("then_net" |
INT_MAX Net executed when condition is true | Arg ("else_net","Net executed when condition is false (optional)").Input(0 |
INT_MAX Net executed when condition is true Scalar boolean condition | AllowInplace ([](int in, int out) -> bool{return true;}) |
REGISTER_CUDA_OPERATOR (If, IfOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Im2Col, Im2ColOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (Col2Im, Col2ImOp< float, CPUContext >) | |
REGISTER_GRADIENT (Im2Col, GetIm2ColGradient) | |
REGISTER_GRADIENT (Col2Im, GetCol2ImGradient) | |
switch (order) | |
CAFFE_ENFORCE (H >=dkernel_h) | |
CAFFE_ENFORCE (W >=dkernel_w) | |
Input (0,"X","4-tensor in NCHW or NHWC.").Output(0 | |
OPERATOR_SCHEMA (Col2Im).NumInputs(2).NumOutputs(1) | |
REGISTER_CUDA_OPERATOR (Im2Col, Im2ColOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Col2Im, Col2ImOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (IntIndexCreate, IndexCreateOp< int32_t >) | |
REGISTER_CPU_OPERATOR (LongIndexCreate, IndexCreateOp< int64_t >) | |
REGISTER_CPU_OPERATOR (StringIndexCreate, IndexCreateOp< std::string >) | |
REGISTER_CPU_OPERATOR (IndexGet, IndexGetOp) | |
REGISTER_CPU_OPERATOR (IndexLoad, IndexLoadOp) | |
REGISTER_CPU_OPERATOR (IndexStore, IndexStoreOp) | |
REGISTER_CPU_OPERATOR (IndexFreeze, IndexFreezeOp) | |
REGISTER_CPU_OPERATOR (IndexSize, IndexSizeOp) | |
Max number of including the zero entry | Output (0,"handler","Pointer to an Index instance.") |
Max number of including the zero entry | Output (0,"handle","Pointer to an Index instance.") |
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index new entries are added into the index If an insert is necessary but max_elements has been fail DOC | Input (0,"handle","Pointer to an Index instance.").Input(1 |
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index new entries are added into the index If an insert is necessary but max_elements has been fail DOC Tensor of keys to be looked up | Output (0,"indices","Indices for each of the keys.") |
disallowing creation of new index entries Should not be called concurrently with IndexGet DOC The input handle | EnforceInplace ({{0, 0}}) |
Pointer to an Index instance | Input (1,"items","1-D tensor with elements starting with index 1.").Output(0 |
Pointer to an Index instance | Output (0,"items","Scalar int64 tensor with number of entries.") |
NO_GRADIENT (IndexGetOp) | |
NO_GRADIENT (IntIndexCreate) | |
NO_GRADIENT (LongIndexCreate) | |
NO_GRADIENT (StringIndexCreate) | |
SHOULD_NOT_DO_GRADIENT (IndexFreeze) | |
SHOULD_NOT_DO_GRADIENT (IndexLoad) | |
SHOULD_NOT_DO_GRADIENT (IndexStore) | |
SHOULD_NOT_DO_GRADIENT (IndexSize) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< caffe2::IndexBase >) | |
REGISTER_BLOB_SERIALIZER ((TypeMeta::Id< std::unique_ptr< caffe2::IndexBase >>()), IndexSerializer) | |
REGISTER_BLOB_DESERIALIZER (std::unique_ptr< caffe2::IndexBase >, IndexDeserializer) | |
REGISTER_CPU_OPERATOR (InstanceNormGradient, InstanceNormGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (InstanceNormGradient).NumInputs(4 | |
NumOutputs (3) | |
REGISTER_GRADIENT (InstanceNorm, GetInstanceNormGradient) | |
REGISTER_CPU_OPERATOR (InstanceNorm, InstanceNormOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (BernoulliJSD, BernoulliJSDOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (BernoulliJSDGradient, BernoulliJSDGradientOp< float, CPUContext >) | |
array of probabilities for prediction | Input (0,"T","array of probabilities for target").Output(0 |
OPERATOR_SCHEMA (BernoulliJSDGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (BernoulliJSD, GetBernoulliJSDGradient) | |
REGISTER_CPU_OPERATOR (KeySplit, KeySplitOp< int64_t, CPUContext >) | |
NO_GRADIENT (KeySplitOp) | |
OPERATOR_SCHEMA (KeySplit).NumInputs(1).NumOutputs(1 | |
REGISTER_CPU_OPERATOR (LayerNorm, LayerNormOp< CPUContext >) | |
OPERATOR_SCHEMA (LayerNormGradient).NumInputs(5).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (LayerNormGradient, LayerNormGradientOp< CPUContext >) | |
REGISTER_GRADIENT (LayerNorm, GetLayerNormGradient) | |
std::vector< int > | input_dims (input_dims_long.begin(), input_dims_long.end()) |
ArgumentHelper | helper (def) |
std::vector< int > | stat_dims (input_dims.begin(), input_dims.begin()+canonical_axis) |
stat_dims | push_back (1) |
SetDoc (R"DOC( Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf. Given an input vector x \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}], this op treats dimensions a_k through a_{n-1} as feature vectors. For each feature vector, the op computes the mean and standard deviation. Then, it returns the normalized values (with respect to the feature vector). Note that this op does not contain the scale and bias terms described in the paper. Simply follow this op with an FC op to add those. Concretely, this op implements: h = \frac{1}{\sigma}(a - \mu) where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2} where H is the number of hidden units (i.e. product of dimensions from 'axis' to the end.) )DOC").Arg("axis" | |
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size | Arg ("epsilon","(float) default to 0.001. Small value to be added to the stdev when"" dividing out by that value. This prevents division by zero.").Input(0 |
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size Input tensor which layer normalization will be applied to | Output (0,"output","Normalized values").Output(1 |
Describes axis of the inputs Defaults to one because the axis most likely describes the batch size Input tensor which layer normalization will be applied to Mean values for each feature vector | Output (2,"stddev","Standard deviations for each feature vector") |
REGISTER_CPU_OPERATOR (LeakyRelu, LeakyReluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LeakyReluGradient, LeakyReluGradientOp< float, CPUContext >) | |
Coefficient of default value is and produces one output | data (Tensor< T >) where the function`f(x) |
Arg ("alpha","Coefficient of leakage").InheritOnnxSchema("LeakyRelu") | |
REGISTER_GRADIENT (LeakyRelu, GetLeakyReluGradient) | |
REGISTER_CPU_OPERATOR (SparseLengthsSumFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext >) | |
but operating on bit rowwise quantized matrices with fused | storage (where each row stores quantized values, and then 4-byte scale and 4-byte bias).) DOC") .Input( 0 |
but operating on bit rowwise quantized matrices with fused uint8 tensor obtained with | operator FloatToFused8BitRowwiseQuantized") .Input (1,"INDICES","Integer vector containing indices of the first ""dimension of DATA for the slices that are being aggregated").Input(2 |
but operating on bit rowwise quantized matrices with fused uint8 tensor obtained with Vector with the same sum of elements as the first dimension of DATA | Output (0,"output","output") |
NO_GRADIENT (SparseLengthsSumFused8BitRowwise) | |
REGISTER_CPU_OPERATOR (SparseLengthsWeightedSumFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext, true >) | |
but operating on bit rowwise quantized matrices with fused uint8 tensor obtained with Vector with the same sum of elements as the first dimension of DATA | Input (3,"WEIGHTS","Vector of weights to scale rows of DATA with before reduction").Output(0 |
NO_GRADIENT (SparseLengthsWeightedSumFused8BitRowwise) | |
REGISTER_CPU_OPERATOR (SparseLengthsMeanFused8BitRowwise, SparseLengthsFused8BitRowwiseOp< CPUContext, false, true >) | |
NO_GRADIENT (SparseLengthsMeanFused8BitRowwise) | |
REGISTER_CPU_OPERATOR_STR ("SparseLengthsSum", CPUSparseLengthsReductionOp< float, TensorTypes< float, float16 >, 0, 0 >) | |
REGISTER_CPU_OPERATOR_STR ("SparseLengthsWeightedSum", CPUSparseLengthsReductionOp< float, TensorTypes< float, float16 >, 1, 0 >) | |
REGISTER_CPU_OPERATOR_STR ("SparseLengthsMean", CPUSparseLengthsReductionOp< float, TensorTypes< float, float16 >, 0, 1 >) | |
for each weights are accessed by where L is the length of given row This is basically a fused | operator of LengthsRangeFill+Gather+SparseWeightedSum) DOC") .Input (0,"DATA","uint8 tensor obtained with ""operator FloatToRowwiseQuantized8Bits").Input(1 |
for each weights are accessed by where L is the length of given row This is basically a fused Scalar multipliers for the input slices Must be a vector with the length matching the length of DATA | Input (2,"INDICES","Integer vector containing indices of the first ""dimension of DATA for the slices that are being aggregated").Input(3 |
REGISTER_CPU_OPERATOR_STR ("SparseLengthsPositionalWeightedSum", CPUSparseLengthsReductionOp< float, TensorTypes< float, float16 >, 1, 0, 1 >) | |
REGISTER_CPU_OPERATOR (Rowwise8BitQuantizedToFloat, Rowwise8BitQuantizedToFloatOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (FloatToRowwiseQuantized8Bits, FloatToRowwiseQuantized8BitsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SparseLengthsSum8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SparseLengthsWeightedSum8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 1 >) | |
REGISTER_CPU_OPERATOR (SparseLengthsMean8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >) | |
REGISTER_CPU_OPERATOR (SparseLengthsWeightedMean8BitsRowwise, SparseLengths8BitsRowwiseOp< CPUContext, 1, 1 >) | |
NumInputs (5).NumOutputs(1).SetDoc(R"DOC( Variation of SparseLengthsWeightedSum operator | |
reshape it into matrix of | size (m_1, m_2 x...x m_n) and apply row-wise quantization.After this |
NO_GRADIENT (Rowwise8BitQuantizedToFloat) | |
NO_GRADIENT (FloatToRowwiseQuantized8Bits) | |
NO_GRADIENT (SparseLengthsSum8BitsRowwise) | |
NO_GRADIENT (SparseLengthsWeightedSum8BitsRowwise) | |
NO_GRADIENT (SparseLengthsMean8BitsRowwise) | |
NO_GRADIENT (SparseLengthsWeightedMean8BitsRowwise) | |
REGISTER_CPU_OPERATOR (LengthsTile, LengthsTileOp< CPUContext >) | |
REGISTER_CUDA_OPERATOR (LengthsTile, LengthsTileOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (LengthsTopK, LengthsTopKOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsTopKGradient, LengthsTopKGradientOp< float, CPUContext >) | |
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC | Input (0,"DATA","Tensor of rank 1. First dimension must be equal to the sum of ""lengths").Input(1 |
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank | Output (0,"TopKValue","Output top k elements for each segment, with""shape=(SIZE(lengths), k)").Output(1 |
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank Output indices in DATA corresponding to value in TopKValue | Arg ("k","the number of top values to return for each segment, if the number ""of values is smaller than k, the values would be padded with 0 and ""indices would be padded with -1.") |
OPERATOR_SCHEMA (LengthsTopKGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (LengthsTopK, GetLengthsTopKGradient) | |
REGISTER_CPU_OPERATOR (DBExists, DBExistsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Load, LoadOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Save, SaveOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Checkpoint, CheckpointOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Snapshot, CheckpointOp< CPUContext >) | |
A scalar bool Tensor | Arg ("absolute_path","(int, default 0) if set, use the db path directly and do not prepend ""the current root folder of the workspace.").Arg("db_name" |
A scalar bool Tensor string the path to the db to load | Arg ("db_type","(string) the type of the db.") |
NumInputs (0, INT_MAX).NumOutputs(0 | |
INT_MAX | SetDoc (R"DOC( The Load operator loads a set of serialized blobs from a db or multiple dbs. It takes [0, infinity) number of inputs and [0, infinity) number of outputs, using the db keys to match the db entries with the outputs. If at least one input is passed, then it is assumed that the input blobs are a set of DBReaders to load from. Otherwise the db or dbs argument is used to load blobs from one single db or multiple dbs respectively. db_type argument is used to specify the type of the input db/dbs. )DOC").Arg("absolute_path" |
INT_MAX default if use the db path directly and do not prepend the current root folder of the workspace | Arg ("add_prefix","(string, default=\"\") blobs will be prefixed with this when loading.""Useful for avoiding collisions with blobs existing in the workspace.""The output blob names specified to this op should include this prefix.").Arg("strip_prefix" |
template<typename... Ts> | |
string | FormatString (const string &pattern, Ts...values) |
REGISTER_CUDA_OPERATOR (Load, LoadOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Save, SaveOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Checkpoint, CheckpointOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (LRN, LRNOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LRNGradient, LRNGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (LRN).NumInputs(1).NumOutputs(1 | |
InheritOnnxSchema ("LRN") | |
OPERATOR_SCHEMA (LRNGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (LRN, GetLRNGradient) | |
REGISTER_CPU_OPERATOR (LC, LocallyConnectedOp< float, CPUContext >) | |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (LC1D, LocallyConnectedOp< float, CPUContext >) |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (LC2D, LocallyConnectedOp< float, CPUContext >) |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (LC3D, LocallyConnectedOp< float, CPUContext >) |
NumInputs(2, 3).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (LCGradient, LocallyConnectedGradientOp< float, CPUContext >) |
OPERATOR_SCHEMA (LCGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (LC1DGradient, LocallyConnectedGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (LC1DGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (LC2DGradient, LocallyConnectedGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (LC2DGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (LC3DGradient, LocallyConnectedGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (LC3DGradient).NumInputs(2 | |
REGISTER_GRADIENT (LC, GetLocallyConnectedGradient) | |
REGISTER_GRADIENT (LC1D, GetLocallyConnectedGradient) | |
REGISTER_GRADIENT (LC2D, GetLocallyConnectedGradient) | |
REGISTER_GRADIENT (LC3D, GetLocallyConnectedGradient) | |
REGISTER_CUDA_OPERATOR (LC, LocallyConnectedOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LCGradient, LocallyConnectedGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC1D, LocallyConnectedOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC1DGradient, LocallyConnectedGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC2D, LocallyConnectedOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC2DGradient, LocallyConnectedGradientOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC3D, LocallyConnectedOp< float, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (LC3DGradient, LocallyConnectedGradientOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (Log, UnaryElementwiseOp< TensorTypes< float >, CPUContext, LogCPUFunctor >) | |
element wise This operation can be done in an in place fashion by providing the same input and output blobs DOC The natural log of the input tensor computed element wise | InheritOnnxSchema ("Log") |
REGISTER_GRADIENT (Log, GetLogGradient) | |
REGISTER_CPU_OPERATOR (Logit, UnaryElementwiseWithArgsOp< TensorTypes< float >, CPUContext, LogitCPUFunctor >) | |
REGISTER_CPU_OPERATOR (LogitGradient, LogitGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (AveragedLoss, AveragedLoss< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (AveragedLossGradient, AveragedLossGradient< float, CPUContext >) | |
NumInputs(1).NumOutputs(1).ScalarType(TensorProto | OPERATOR_SCHEMA (AveragedLossGradient).NumInputs(2).NumOutputs(1) |
REGISTER_GRADIENT (AveragedLoss, GetAveragedLossGradient) | |
REGISTER_CPU_OPERATOR (LpPool, PoolOp< float, CPUContext, LpPool >) | |
REGISTER_CPU_OPERATOR (LpPoolGradient, PoolGradientOp< float, CPUContext, LpPool >) | |
stride and pad lengths defined by the ConvPoolOpBase | operator.L-p pooling consisting of taking the L-p norm of a subset of the input tensor according to the kernel size and downsampling the data into the output blob Y for further processing.) DOC") .Input (0,"X","Input data tensor from the previous operator; dimensions ""depend on whether the NCHW or NHWC operators are being used. For example, ""in the former, the input has size (N x C x H x W), where N is the batch ""size, C is the number of channels, and H and W are the height and the width ""of the data. The corresponding permutation of dimensions is used in the ""latter case. ").Output(0 |
OPERATOR_SCHEMA (LpPoolGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (LpPool, GetPoolGradient) | |
REGISTER_CPU_OPERATOR (LSTMUnit, LSTMUnitOp< CPUContext >) | |
NumInputs (4, 5).NumOutputs(2).SetDoc(R"DOC( LSTMUnit computes the activations of a standard LSTM (without peephole connections) | |
in a sequence length aware fashion given the previous cell and the sequence computes the LSTM avoiding computation if the input is | invalid (as in, the value at X[t][n] >=seqLengths[n].) DOC") .Arg("forget_bias"
REGISTER_CPU_OPERATOR (LSTMUnitGradient, LSTMUnitGradientOp< CPUContext >) | |
NumInputs (8, 9).NumOutputs(3).Arg("sequence_lengths" | |
REGISTER_GRADIENT (LSTMUnit, GetLSTMUnitGradient) | |
CAFFE_KNOWN_TYPE (MapType64To64) | |
REGISTER_CPU_OPERATOR (MarginRankingCriterion, MarginRankingCriterionOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (MarginRankingCriterionGradient, MarginRankingCriterionGradientOp< CPUContext >) | |
X2 (Tensor< float >) | |
and label | Y (Tensor< int >) to produce the loss(Tensor< float >) where the loss function |
and label | loss (X1, X2, Y) |
REGISTER_CPU_OPERATOR (Sqr, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SqrCPUFunctor >) | |
Input tensor | Output (0,"output","Squared elements of the input") |
REGISTER_GRADIENT (Sqr, GetSqrGradient) | |
REGISTER_CPU_OPERATOR (Sign, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SignCPUFunctor >) | |
REGISTER_CPU_OPERATOR (MatMul, MatMulOp< float, CPUContext >) | |
if (trans_a) | |
if (trans_b) | |
out[0] | add_dims (M) |
out[0] | add_dims (N) |
SetDoc (R"DOC( Matrix multiplication Y = A * B, where A has size (M x K), B has size (K x N), and Y will have a size (M x N). )DOC").Input(0 | |
matrix of | size (M x K)") .Input(1 |
matrix of matrix of | size (K x N)") .Output(0 |
matrix of matrix of matrix of Exclusive axis that divides the first and second dimension of matrix default to | Arg ("axis_b","Exclusive axis that divides the first and second dimension \ of matrix B, default to 1").Arg("trans_a" |
REGISTER_GRADIENT (MatMul, GetMatMulGradient) | |
REGISTER_CUDA_OPERATOR (MatMul, MatMulOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (Mean, MeanOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (MeanGradient, MeanGradientOp< CPUContext >) | |
SetDoc (R"DOC( Element-wise mean of each of the input tensors. The first input tensor can be used in-place as the output tensor, in which case the mean will be done in place and results will be accumulated in input0. All inputs and outputs must have the same shape and data type. )DOC").Input(0 | |
First of the input tensors Can be inplace | Output (0,"mean","Output tensor. Same dimension as inputs.") |
REGISTER_GRADIENT (Mean, GetMeanGradient) | |
REGISTER_CPU_OPERATOR (MaxGradient, MaxGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (MinGradient, MinGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (MaxGradient).NumInputs(3 | |
INT_MAX | NumOutputs (1, INT_MAX) |
OPERATOR_SCHEMA (MinGradient).NumInputs(3 | |
REGISTER_GRADIENT (Max, GetMaxGradient) | |
REGISTER_GRADIENT (Min, GetMinGradient) | |
REGISTER_CPU_OPERATOR (Max, MaxOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (Min, MinOp< float, CPUContext >) | |
SetDoc (R"DOC( Element-wise max of each of the input tensors. The first input tensor can be used in-place as the output tensor, in which case the max will be done in place and results will be accumulated in input0. All inputs and outputs must have the same shape and data type. )DOC").Input(0 | |
First of the input tensors Can be inplace | Output (0,"max","Output tensor. Same dimension as inputs.").InheritOnnxSchema("Max") |
SetDoc (R"DOC( Element-wise min of each of the input tensors. The first input tensor can be used in-place as the output tensor, in which case the min will be done in place and results will be accumulated in input0. All inputs and outputs must have the same shape and data type. )DOC").Input(0 | |
First of the input tensors Can be inplace | Output (0,"min","Output tensor. Same dimension as inputs.").InheritOnnxSchema("Min") |
REGISTER_CPU_OPERATOR (MultiClassAccuracy, MultiClassAccuracyOp< float, CPUContext >) | |
D float | tensor (N, D,) of predicted scores of each class for" "each data.N is the number of instances |
D float i batch size D is number of possible classes labels | Input (1,"labels","1-D int tensor (N,) of labels for each instance.").Output(0 |
D float i batch size D is number of possible classes labels D float | tensor (D,) of accuracy for each class.If a class has no" "instance in the batch |
D float i batch size D is number of possible classes labels D float its accuracy score is set to zero | Output (1,"amounts","1-D int tensor (D,) of number of instances for each class in the batch.") |
SHOULD_NOT_DO_GRADIENT (MultiClassAccuracy) | |
REGISTER_CPU_OPERATOR (NegateGradient, NegateGradientOp< CPUContext >) | |
SetDoc (R"DOC( NegateGradient operator in forward pass simply copies input to the output, and in backward pass, flips the sign of the output gradient )DOC") | |
REGISTER_GRADIENT (NegateGradient, GetNegateGradientGradient) | |
REGISTER_CPU_OPERATOR (Negative, UnaryElementwiseOp< TensorTypes< float, double, int, long >, CPUContext, NegativeCPUFunctor >) | |
input tensor | Output (0,"Y","1D input tensor").InheritOnnxSchema("Neg") |
REGISTER_GRADIENT (Negative, GetNegativeGradient) | |
REGISTER_CPU_OPERATOR (NGramFromCategorical, NGramFromCategoricalOp< float, int64_t, CPUContext >) | |
NO_GRADIENT (NGramFromCategorical) | |
OPERATOR_SCHEMA (NGramFromCategorical).NumInputs(1).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (NormalizeL1, NormalizeL1Op< float, CPUContext >) | |
axis to normalize | SetDoc (R"DOC( Given a matrix, apply L1-normalization along the specified axis. )DOC") |
REGISTER_CPU_OPERATOR (Normalize, NormalizeOp< float, CPUContext >) | |
axis to normalize | SetDoc (R"DOC( Given a matrix, apply L2-normalization along the specified dimension. )DOC").IdenticalTypeAndShape() |
REGISTER_CPU_OPERATOR (NormalizeGradient, NormalizeGradientOp< float, CPUContext >) | |
REGISTER_GRADIENT (Normalize, GetNormalizeGradient) | |
vector< TensorShape > | TensorInferenceForBatchOneHot (const OperatorDef &, const vector< TensorShape > &in) |
OpSchema::Cost | CostInferenceForBatchOneHot (const OperatorDef &def, const vector< TensorShape > &in) |
REGISTER_CPU_OPERATOR (BatchBucketOneHot, BatchBucketOneHotOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (BatchOneHot, BatchOneHotOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (OneHot, OneHotOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SegmentOneHot, SegmentOneHotOp) | |
REGISTER_CPU_OPERATOR (ONNXWhile, ONNXWhileOp< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( *** EXPERIMENTAL. This operator is a work-in-progress. No assumption should be made about the stability or correctness of this op. *** Generic Looping construct conforming to the ONNX Loop operator spec. This loop has multiple termination conditions: 1. Trip count. Iteration count specified at runtime. Set by specifying the input M. Optional. Set to empty string to omit. Note that a static trip count (specified at graph construction time) can be specified by passing in a constant node for input M. 2. Loop termination condition. This is an input to the op that determines whether to run the first iteration and also a loop-carried dependency for the body graph. The body graph must yield a value for the condition variable, whether this input is provided or not. This table summarizes the operating modes of this operator with equivalent C-style code: Operator inputs defined as (max_trip_count, condition_var). Omitted optional inputs are represented as empty string. Concretely, in this caffe2 op an input is marked as omitted by setting its 'has_{name}' argument to False. input ("", ""): for (int i=0; ; ++i) { cond = ... // Note this value is ignored, but is required in the body } input ("", cond) // Note this is analogous to a while loop bool cond = ...; for (int i=0; cond; ++i) { cond = ...; } input ("", 1) // Note this is analogous to a do-while loop bool cond = true for (int i=0; cond; ++i) { cond = ...; } input (trip_count, "") // Note this is analogous to a for loop int trip_count = ... for (int i=0; i < trip_count; ++i) { cond = ...; // ignored } input (trip_count, cond) int trip_count = ...; bool cond = ...; for (int i=0; i < trip_count && cond; ++i) { cond = ...; } )DOC").Arg("loop_net" |
INT_MAX Net executed on each iteration | Input (0,"condition","Scalar boolean condition").AllowInplace([](int in |
REGISTER_CPU_OPERATOR (NHWC2NCHW, NHWC2NCHWOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (NCHW2NHWC, NCHW2NHWCOp< float, CPUContext >) | |
out[0] | add_dims (in[0].dims(3)) |
out[0] | add_dims (in[0].dims(1)) |
out[0] | add_dims (in[0].dims(2)) |
SetDoc (R"DOC( The operator switches the order of data in a tensor from NHWC- sample index N, height H, width W and channels C, to the NCHW order. )DOC").Input(0 | |
The input | data (Tensor< float >) in the NHWC order.") .Output( 0 |
The input The output | tensor (Tensor< float >) in the NCHW order.") |
OPERATOR_SCHEMA (NCHW2NHWC).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( The operator switches the order of data in a tensor from NCHW- sample index N | |
channels height H and width to the NHWC order DOC | Input (0,"data","The input data (Tensor<float>) in the NCHW order.").Output(0 |
REGISTER_GRADIENT (NHWC2NCHW, GetNHWC2NCHWGradient) | |
REGISTER_GRADIENT (NCHW2NHWC, GetNCHW2NHWCGradient) | |
REGISTER_CPU_OPERATOR (PackSegments, PackSegmentsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (UnpackSegments, UnpackSegmentsOp< CPUContext >) | |
SetDoc ("Map N dim tensor to N+1 dim based on length blob. Sequences that \ are shorter than the longest sequence are padded with zeros.").Input(0 | |
d int long tensor contains the length in each of the output | Input (1,"tensor","N dim Tensor.").Output(0 |
d int long tensor contains the length in each of the output N dim Tensor where | dim (1) is the max length" " |
d int long tensor contains the length in each of the output N dim Tensor where | dim (0) is the batch size.") .Output( 1 |
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is true otherwise | Arg ("pad_minf","Padding number in the packed segments. Use true to pad \ -infinity, otherwise pad zeros").Arg("return_presence_mask" |
d int long tensor contains the length in each of the input | Input (1,"tensor","N+1 dim Tensor.").Output(0 |
REGISTER_GRADIENT (PackSegments, GetPackSegmentsGradient) | |
REGISTER_GRADIENT (UnpackSegments, GetUnpackSegmentsGradient) | |
PadMode | StringToPadMode (const string &mode) |
REGISTER_CPU_OPERATOR (PadImage, PadImageOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (PadImageGradient, PadImageGradientOp< float, CPUContext >) | |
CPUContext::PadTensorInference | SetDoc (R"DOC( PadImage pads values around the boundary of an image according to the pad values and stride sizes defined by the ConvPoolOpBase operator. )DOC").Input(0 |
dimensions depend on whether the NCHW or NHWC operators are being used For in the the input has | size (N x C x H x W) |
dimensions depend on whether the NCHW or NHWC operators are being used For in the the input has where N is the batch C is the number of and H and W are the height and the width of the data The corresponding permutation of dimensions is used in the latter case | Output (0,"Y","Output data tensor from padding the H and W dimensions on ""the tensor. Dimensions will vary based on various pad and stride ""sizes.") |
OPERATOR_SCHEMA (PadImageGradient).NumInputs(1).NumOutputs(1) | |
REGISTER_GRADIENT (PadImage, GetPadImageGradient) | |
REGISTER_CPU_OPERATOR (Percentile, PercentileOp< CPUContext >) | |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this | operator takes as input a tensor of floats to find the percentile values for, a 2D tensor of floats, where the first column of the tensor represents sampled values, and the second column represents the percentile labels, and a tensor of integers lengths.This lengths tensor is used because the operator works on multiple sets of raw values at the same time.For example, for an input:original_values=[[3, 5, 3], [5, 1, 6]], lengths=[2, 1, 1], value_to_pct=[[3, 0.2],[5, 0.5],[1, 0.3],[3, 0.6]] Our operator expects that each column i of the input tensor is sampled from distribution i.Lengths tells us that the first two elements in value_to_pct are sampled from distribution 1, the next is from distribution two, and the last is from distribution 3.We expect the output of our operator to give us[[0.2, 1.0, 0.6],[0.5, 0.3, 1.0]].To calculate the percentile of an element, we check to see if its value is already mapped to a percentile in value_to_pct.If so, we return that value.If not, we linearly interpolate between the two closest values in value_to_pct.If the value is larger than all values in value_to_pct, we return 1.If it's smaller than all the values, we return 0.) DOC") .Input (0,"original_values","Input 2D tensor of floats, representing the original, raw data to calculate percentiles for.").Input(1 |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this Sorted with columns Each element in the first column is a float representing the raw value of a sample Its corresponding element in the next column represents the percentile it maps to | Input (2,"lengths","1D tensor, representing the length of each distribution. We expect that the sum of elements of this tensor"" is equal to the total length of value_to_pct.").Output(0 |
NO_GRADIENT (Percentile) | |
REGISTER_CPU_OPERATOR (Perplexity, PerplexityOp< float, CPUContext >) | |
OPERATOR_SCHEMA (Perplexity).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( Perplexity calculates how well a probability distribution predicts a sample. Perplexity takes a 1-D tensor containing a batch of probabilities. Each value in the tensor belongs to a different sample and represents the probability of the model predicting the true label for that sample. The operator returns a single (float) perplexity value for the batch. )DOC").Input(0 | |
The input data as Tensor It contains a batch of true label or target probabilities | Output (0,"output","The output- a single (float) perplexity value for the ""batch") |
SHOULD_NOT_DO_GRADIENT (Perplexity) | |
REGISTER_CPU_OPERATOR (PiecewiseLinearTransform, PiecewiseLinearTransformOp< float, CPUContext >) | |
NumInputs (1, 4).NumOutputs(1).SetDoc(R"DOC( PiecewiseLinearTransform takes inputs -- predictions | |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary | predictions (Nx2 or Nx1 tensor) |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is | needed (see details below).-The transform parameters(bounds |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is intercepts can be passed either through args or through input blobs If we have multiple groups of piecewise linear each group has the same number of pieces If a prediction is out of the it is capped to the smallest or largest bound DOC | Arg ("bounds","1-D vector of size (prediction_dimensions x (pieces+1)) contain the ""upper bounds of each piece of linear function. One special case is ""the first bound is the lower bound of whole piecewise function and we ""treat it the same as the left most functions. (bounds, slopes, ""intercepts) can be passed through either arg or input blobs.").Arg("slopes" |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is intercepts can be passed either through args or through input blobs If we have multiple groups of piecewise linear each group has the same number of pieces If a prediction is out of the it is capped to the smallest or largest bound DOC D vector of | size (prediction_dimensions x pieces) containing the" "slopes of linear function") .Arg( "intercepts" |
REGISTER_CPU_OPERATOR (AveragePoolGradient, PoolGradientOp< float, CPUContext, AveragePool< float >>) | |
OPERATOR_SCHEMA (AveragePoolGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (AveragePool1DGradient, PoolGradientOp< float, CPUContext, AveragePool< float >>) | |
OPERATOR_SCHEMA (AveragePool1DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (AveragePool2DGradient, PoolGradientOp< float, CPUContext, AveragePool< float >>) | |
OPERATOR_SCHEMA (AveragePool2DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (AveragePool3DGradient, PoolGradientOp< float, CPUContext, AveragePool< float >>) | |
OPERATOR_SCHEMA (AveragePool3DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (MaxPoolGradient, PoolGradientOp< float, CPUContext, MaxPool< float >>) | |
OPERATOR_SCHEMA (MaxPoolGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (MaxPool1DGradient, PoolGradientOp< float, CPUContext, MaxPool< float >>) | |
OPERATOR_SCHEMA (MaxPool1DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (MaxPool2DGradient, PoolGradientOp< float, CPUContext, MaxPool< float >>) | |
OPERATOR_SCHEMA (MaxPool2DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (MaxPool3DGradient, PoolGradientOp< float, CPUContext, MaxPool< float >>) | |
OPERATOR_SCHEMA (MaxPool3DGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (AveragePool, GetPoolGradient) | |
REGISTER_GRADIENT (AveragePool1D, GetPoolGradient) | |
REGISTER_GRADIENT (AveragePool2D, GetPoolGradient) | |
REGISTER_GRADIENT (AveragePool3D, GetPoolGradient) | |
REGISTER_GRADIENT (MaxPool, GetPoolGradient) | |
REGISTER_GRADIENT (MaxPool1D, GetPoolGradient) | |
REGISTER_GRADIENT (MaxPool2D, GetPoolGradient) | |
REGISTER_GRADIENT (MaxPool3D, GetPoolGradient) | |
std::function< void(OpSchema &)> | AveragePoolDocGenerator (const char *dim) |
std::function< void(OpSchema &)> | MaxPoolDocGenerator (const char *dim) |
REGISTER_CPU_OPERATOR (AveragePool, PoolOp< float, CPUContext, AveragePool< float >>) | |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (AveragePool1D, PoolOp< float, CPUContext, AveragePool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (AveragePool2D, PoolOp< float, CPUContext, AveragePool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (AveragePool3D, PoolOp< float, CPUContext, AveragePool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (MaxPool, PoolOp< float, CPUContext, MaxPool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (MaxPool1D, PoolOp< float, CPUContext, MaxPool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (MaxPool2D, PoolOp< float, CPUContext, MaxPool< float >>) |
NumInputs(1).NumOutputs(1).TensorInferenceFunction(ConvPoolOpBase< CPUContext > | REGISTER_CPU_OPERATOR (MaxPool3D, PoolOp< float, CPUContext, MaxPool< float >>) |
REGISTER_CPU_OPERATOR (Pow, PowOp< TensorTypes< float >, CPUContext, EigenPowFunctor, SameTypeAsInput >).NumInputs(1 | |
NumOutputs (1).Arg("exponent" | |
The exponent of the power function | AllowInplace ({{0, 0},{1, 0}}).IdenticalTypeAndShapeOfInput(0).SetDoc(R"DOC( Pow takes input data (Tensor<T>) and an argument exponent |
REGISTER_CPU_OPERATOR (PRelu, PReluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (PReluGradient, PReluGradientOp< float, CPUContext >) | |
OPERATOR_SCHEMA (PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC( PReluGradient takes both Y and dY and uses this to update dX and dW according to the chain rule and derivatives of the rectified linear function. )DOC") | |
REGISTER_GRADIENT (PRelu, GetPReluGradient) | |
REGISTER_CPU_OPERATOR (PrependDim, PrependDimOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (MergeDim, MergeDimOp< CPUContext >) | |
SetDoc (R"DOC( Reshape the tensor by prepending a dimension of fixed size and dividing the size of the next dimension by that amount. )DOC").Arg("dim_size" | |
Size of the dimension to prepend | Input (0,"data","An input tensor.").Output(0 |
SetDoc (R"DOC( Merge first two dimensions in a single dimension with size dim(0) * dim(1). )DOC").Input(0 | |
An input tensor | Output (0,"reshaped","Reshaped tensor.") |
REGISTER_GRADIENT (PrependDim, GetPrependDimGradient) | |
REGISTER_CUDA_OPERATOR (PrependDim, PrependDimOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (MergeDim, MergeDimOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (QuantDecode, QuantDecodeOp< QuantDecodeRunTy::RUN_ALWAYS >) | |
REGISTER_CPU_OPERATOR (QuantDecodeGradient, QuantDecodeGradientOp) | |
vector< TIndex > | ConvertFromInputIndex (TIndex index, vector< TIndex > &dims) |
TIndex | ConvertToOutputIndex (const vector< int > &axes, const vector< TIndex > &nd_idx, vector< TIndex > &dims) |
template<typename T > | |
T | Add (T x, T y) |
template<typename T , class Context > | |
void | ComputeOp (const T *X_data, const TIndex X_size, vector< TIndex > &dims, T *Y_data, vector< int > &axes, int keepdims, T(*binary_op)(T, T)) |
REGISTER_CPU_OPERATOR (ReduceSum, ReduceSumOp< float, CPUContext >) | |
then the resulted tensor have the reduced dimension pruned DOC | Arg ("axes","A list of integers, along which to reduce.").Arg("keepdims" |
then the resulted tensor have the reduced dimension pruned DOC Keep the reduced | dimension (s) or not |
then the resulted tensor have the reduced dimension pruned DOC Keep the reduced default keeps the reduced An input tensor | Output (0,"reduced","Reduced output tensor.") |
GRADIENT_NOT_IMPLEMENTED_YET (ReduceSum) | |
REGISTER_CPU_OPERATOR (ReduceMean, ReduceMeanOp< float, CPUContext >) | |
GRADIENT_NOT_IMPLEMENTED_YET (ReduceMean) | |
REGISTER_CPU_OPERATOR (ReduceFrontSum, SumReduceDimsOp< CPUContext, true, false >) | |
REGISTER_CPU_OPERATOR (ReduceFrontSumGradient, SumReduceDimsGradientOp< CPUContext, true, false >) | |
REGISTER_GRADIENT (ReduceFrontSum, GetReduceFrontSumGradient) | |
REGISTER_CPU_OPERATOR (ReduceBackSum, SumReduceDimsOp< CPUContext, false, false >) | |
REGISTER_CPU_OPERATOR (ReduceBackSumGradient, SumReduceDimsGradientOp< CPUContext, false, false >) | |
REGISTER_GRADIENT (ReduceBackSum, GetReduceBackSumGradient) | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the first dimension of the input tensor by applying 'Sum'. When lengths is given, sum is only computed with subsets of elements correspondingly. )DOC").Input(0 |
Number of dimensions to reduce T< D1..., Dn > Input data | Input (1,"lengths","Num of elements in each sample, should have size D2 x D3 x ... x Dn.").TensorInferenceFunction([](const OperatorDef &def |
OPERATOR_SCHEMA (ReduceFrontSumGradient).NumInputs(2 | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the last dimension of the input tensor by applying 'Sum'. When lengths is given, sum is only computed with subsets of elements correspondingly. )DOC").Input(0 |
Number of dimensions to reduce T< D1..., Dn > Input data | Input (1,"lengths","Num of elements in each sample, should have size D1 x D2 x ... x D(n-1).").TensorInferenceFunction([](const OperatorDef &def |
OPERATOR_SCHEMA (ReduceBackSumGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (ReduceFrontMean, SumReduceDimsOp< CPUContext, true, true >) | |
REGISTER_CPU_OPERATOR (ReduceFrontMeanGradient, SumReduceDimsGradientOp< CPUContext, true, true >) | |
REGISTER_GRADIENT (ReduceFrontMean, GetReduceFrontMeanGradient) | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the first dimension of the input tensor by applying 'Mean'. When lengths is given, mean is only computed with subsets of elements correspondingly. )DOC").Input(0 |
OPERATOR_SCHEMA (ReduceFrontMeanGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (ReduceBackMean, SumReduceDimsOp< CPUContext, false, true >) | |
REGISTER_CPU_OPERATOR (ReduceBackMeanGradient, SumReduceDimsGradientOp< CPUContext, false, true >) | |
REGISTER_GRADIENT (ReduceBackMean, GetReduceBackMeanGradient) | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the last dimension of the input tensor by applying 'Mean'. When lengths is given, mean is only computed with subsets of elements correspondingly. )DOC").Input(0 |
OPERATOR_SCHEMA (ReduceBackMeanGradient).NumInputs(2 | |
REGISTER_CPU_OPERATOR (ReduceFrontMax, MaxReduceDimsOp< float, CPUContext, true >) | |
REGISTER_CPU_OPERATOR (ReduceFrontMaxGradient, MaxReduceDimsGradientOp< float, CPUContext, true >) | |
REGISTER_CPU_OPERATOR (ReduceBackMax, MaxReduceDimsOp< float, CPUContext, false >) | |
REGISTER_CPU_OPERATOR (ReduceBackMaxGradient, MaxReduceDimsGradientOp< float, CPUContext, false >) | |
REGISTER_GRADIENT (ReduceFrontMax, GetReduceFrontMaxGradient) | |
REGISTER_GRADIENT (ReduceBackMax, GetReduceBackMaxGradient) | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the first dimension of the input tensor by applying 'Max'. When lengths is given, max is only computed with subsets of elements correspondingly. )DOC").Input(0 |
Number of dimensions to reduce T< D1..., Dn > Input data | Input (1,"lengths","Num of elements in each sample, should have size D2 x D3 ... x Dn.").TensorInferenceFunction([](const OperatorDef &def |
OPERATOR_SCHEMA (ReduceFrontMaxGradient).NumInputs(3 | |
Number of dimensions to reduce | SetDoc (R"DOC( Reduces the input tensor along the last dimension of the input tensor by applying 'Max'. When lengths is given, max is only computed with subsets of elements correspondingly. )DOC").Input(0 |
OPERATOR_SCHEMA (ReduceBackMaxGradient).NumInputs(3 | |
REGISTER_CPU_OPERATOR (SumElements, SumElementsOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SumElementsInt, SumElementsIntOp< int, CPUContext >) | |
REGISTER_CPU_OPERATOR (SumSqrElements, SumSqrElementsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SumElementsGradient, SumElementsGradientOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (RowwiseMax, MaxReductionOp< float, CPUContext, true >) | |
REGISTER_CPU_OPERATOR (RowwiseMaxGradient, MaxReductionGradientOp< float, CPUContext, true >) | |
REGISTER_CPU_OPERATOR (ColwiseMaxGradient, MaxReductionGradientOp< float, CPUContext, false >) | |
REGISTER_CPU_OPERATOR (ColwiseMax, MaxReductionOp< float, CPUContext, false >) | |
NumInputs(1).NumOutputs(1).ScalarType(TensorProto NumInputs(1).NumOutputs(1).ScalarType(TensorProto | SHOULD_NOT_DO_GRADIENT (SumElementsInt) |
NumInputs(1).NumOutputs(1).ScalarType(TensorProto | OPERATOR_SCHEMA (SumElementsGradient).NumInputs(2).NumOutputs(1) |
REGISTER_GRADIENT (SumElements, GetSumElementsGradient) | |
A tensor of dimensions batch_size x M x N to compute rowwise max | Output (0,"Y","batch_size x M rowwise-max results matrix.") |
OPERATOR_SCHEMA (RowwiseMaxGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (RowwiseMax, GetRowwiseMaxGradient) | |
OPERATOR_SCHEMA (ColwiseMaxGradient) | |
A tensor of dimensions batch_size x M x N to compute colwise max | Output (0,"Y","batch_size x N column-max results matrix.") |
OPERATOR_SCHEMA (ColumnMaxGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (ColwiseMax, GetColwiseMaxGradient) | |
REGISTER_CPU_OPERATOR (Relu, ReluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ReluGradient, ReluGradientOp< float, CPUContext >) | |
CostInferenceFunction (CostInferenceForRelu).IdenticalTypeAndShape().SetDoc(R"DOC( Relu takes one input data (Tensor<T>) and produces one output data (Tensor<T>) where the rectified linear function | |
is applied to the tensor elementwise DOC input tensor | InheritOnnxSchema ("Relu") |
SetDoc (R"DOC( ReluGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the rectified linear function. )DOC") | |
REGISTER_GRADIENT (Relu, GetReluGradient) | |
REGISTER_GRADIENT (ReluFp16, GetReluGradient) | |
REGISTER_CPU_OPERATOR (ReplaceNaN, ReplaceNaNOp< CPUContext >) | |
SHOULD_NOT_DO_GRADIENT (ReplaceNaN) | |
REGISTER_CPU_OPERATOR (Reshape, ReshapeOp< float, CPUContext >) | |
out[1] | set_data_type (TensorProto::INT64) |
out[1] | add_dims (in[0].dims_size()) |
if (!helper.HasArgument("shape")) | |
CAFFE_ENFORCE_EQ (in.size(), 1,"New shape must not be specified by the input blob and the ""argument `shape` at the same time.") | |
for (int i=0;i< actualNewShape.size();++i) | |
if (unknownIdx!=-1) | |
for (const auto d:actualNewShape) | |
an extra argument shape must be specified It outputs the reshaped tensor as well as the original shape At most one dimension of the new shape can be In this the value is inferred from the size of the tensor and the remaining dimensions A dimension could also in which case the actual dimension value is going to be copied from the input tensor DOC | Arg ("shape","New shape").Input(0 |
an extra argument shape must be specified It outputs the reshaped tensor as well as the original shape At most one dimension of the new shape can be In this the value is inferred from the size of the tensor and the remaining dimensions A dimension could also in which case the actual dimension value is going to be copied from the input tensor DOC An input tensor | Input (1,"new_shape","New shape.").Output(0 |
an extra argument shape must be specified It outputs the reshaped tensor as well as the original shape At most one dimension of the new shape can be In this the value is inferred from the size of the tensor and the remaining dimensions A dimension could also in which case the actual dimension value is going to be copied from the input tensor DOC An input tensor Reshaped data | Output (1,"old_shape","Original shape.").InheritOnnxSchema("Reshape") |
REGISTER_GRADIENT (Reshape, GetReshapeGradient) | |
REGISTER_CUDA_OPERATOR (Reshape, ReshapeOp< float, CUDAContext >) | |
void | resizeNearest2x (int batch_size, int num_channels, int input_height, int input_width, const float *input, float *output) |
REGISTER_CPU_OPERATOR (ResizeNearest, ResizeNearestOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ResizeNearestGradient, ResizeNearestGradientOp< float, CPUContext >) | |
Scale along width dimension | Arg ("height_scale","Scale along height dimension").SetDoc(R"DOC( Resizes the spatial dimensions of the input using nearest neighbor interpolation. The `width_scale` and `height_scale` arguments control the size of the output |
Scale along width dimension which is given Input tensor | Output (0,"Y","Output tensor") |
REGISTER_GRADIENT (ResizeNearest, GetResizeNearestGradient) | |
REGISTER_CPU_OPERATOR (ReversePackedSegs, ReversePackedSegsOp< CPUContext >) | |
leaving paddings unchanged This | operator is used to reverse input of a recurrent neural network to make it a BRNN.) DOC") .Input (0,"data","a 3-D (lengths, segments, embeddings,) tensor.").Input(1 |
leaving paddings unchanged This length of each segment | Output (0,"reversed data","a (lengths, segments, embeddings,) tensor with each segment reversed""and paddings unchanged.") |
REGISTER_GRADIENT (ReversePackedSegs, GetReversePackedSegsGradient) | |
REGISTER_CPU_OPERATOR (RMACRegions, RMACRegionsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (RecurrentNetworkBlobFetcher, RecurrentNetworkBlobFetcherOp< CPUContext >) | |
Prefix string to prepend extracted blobs | Input (0,"ScratchWorkspaceBlob","Name of scratch workspace blob returned by recurrent network.").Output(0 |
SHOULD_NOT_DO_GRADIENT (RecurrentNetworkBlobFetcher) | |
REGISTER_CUDA_OPERATOR (RecurrentNetworkBlobFetcher, RecurrentNetworkBlobFetcherOp< CUDAContext >) | |
template<> | |
std::unique_ptr< RecurrentNetworkExecutorBase > | createRNNExecutor< CPUContext > (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper rnn_args) |
Implementation of RecurrentNetworkExecutor that uses thread pool for multithreaded execution of RNNs. More... | |
template<class Context > | |
std::unique_ptr< RecurrentNetworkExecutorBase > | createRNNExecutor (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper rnn_args) |
template<> | |
std::unique_ptr< RecurrentNetworkExecutorBase > | createRNNExecutor< CUDAContext > (const NetDef &step_net_def, std::map< string, string > &recurrent_input_map, std::string timestep_blob, ArgumentHelper arg_helper) |
CAFFE_KNOWN_TYPE (detail::ScratchWorkspaces) | |
REGISTER_CPU_OPERATOR (RecurrentNetwork, RecurrentNetworkOp< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( Run the input network in a recurrent fashion. This can be used to implement fairly general recurrent neural networks (RNNs). The operator proceeds as follows. - First, initialize the states from the input recurrent states - For each timestep T, apply the links (that map offsets from input/output tensors into the inputs/outputs for the `step` network) - Finally, alias the recurrent states to the specified output blobs. This is a fairly special-case meta-operator, and so the implementation is somewhat complex. It trades off generality (and frankly usability) against performance and control (compared to e.g. TF dynamic_rnn, Theano scan, etc). See the usage examples for a flavor of how to use it. )DOC") | 
REGISTER_CPU_OPERATOR (RecurrentNetworkGradient, RecurrentNetworkGradientOp< CPUContext >) | |
OPERATOR_SCHEMA (RecurrentNetworkGradient) | |
REGISTER_CPU_OPERATOR (rnn_internal_accumulate_gradient_input, AccumulateInputGradientOp< CPUContext >) | |
INT_MAX | EnforceInplace ({{2, 0}}).Private().SetDoc(R"DOC( Internal RNN operator. )DOC") |
REGISTER_CPU_OPERATOR (rnn_internal_apply_link, RNNApplyLinkOp< CPUContext >) | |
Private ().SetDoc(R"DOC( Internal RNN operator. )DOC") | |
REGISTER_GRADIENT (RecurrentNetwork, GetRecurrentNetworkGradient) | |
REGISTER_CUDNN_OPERATOR (Recurrent, RecurrentOp< float >) | |
OPERATOR_SCHEMA (Recurrent).NumInputs(4).NumOutputs(5).SetDoc(R"DOC( Recurrent wraps the CuDNN R5 RNN implementation. See the CuDNN R5 documentation for more information. In general | |
the implementation takes an | input (TxNxD) tensor |
the implementation takes an the hidden state | input (NxD) |
the implementation takes an the hidden state the cell and a weight | tensor (effectively an opaque blob, where the size and layout is dictated by CuDNN).The outputs are the output(again |
the implementation takes an the hidden state the cell and a weight the final hidden cell | states (NxD).These can be reset(at sequence boundaries across minibatches) by multiplying by zero.The CuDNN arguments(hidden_size |
REGISTER_CUDNN_OPERATOR (RecurrentGradient, RecurrentGradientOp< float >) | |
NumInputs (7).NumOutputs(6).AllowInplace( | |
REGISTER_CUDNN_OPERATOR (RecurrentParamSet, RecurrentParamAccessOp< float, SET_PARAM >) | |
SetDoc ("Set individual parameters of a recurrent net.").Arg("param_type" | |
R | DOC (Type of param to be set:"input_gate_w","forget_gate_w","cell_w","output_gate_w""input_gate_b","forget_gate_b","cell_b","output_gate_b") DOC") .Arg("input_type" |
R recurrent or input | Arg ("layer","layer index (starting from 0)").Input(0 |
R recurrent or input R | DOC (Input blob.Needed for inferring the shapes.A dummy tensor matching the input shape is ok.) DOC") .Input(1 |
R recurrent or input R Blob holding all the parameters | Input (2,"param","Values for the specified parameter").Output(0 |
R recurrent or input R Blob holding all the parameters Blob holding all the | parameters (same as input(1))") |
REGISTER_CUDNN_OPERATOR (RecurrentParamGet, RecurrentParamAccessOp< float, GET_PARAM >) | |
R recurrent or input R Blob holding all the parameters | Output (0,"param","Blob holding the requested values") |
REGISTER_GRADIENT (Recurrent, GetRecurrentGradient) | |
REGISTER_CPU_OPERATOR (RoIAlignGradient, RoIAlignGradientOp< float, CPUContext >) | |
See RoIPoolF | Input (1,"RoIs","See RoIPoolF.").Input(2 |
See RoIPoolF Gradient of forward | output (Y)") .Output(0 |
See RoIPoolF Gradient of forward Gradient of forward | input (X)") |
REGISTER_GRADIENT (RoIAlign, GetRoIAlignGradient) | |
REGISTER_CPU_OPERATOR (RoIAlign, RoIAlignOp< float, CPUContext >) | |
Spatial scale of the input feature map X relative to the input image E if X has a stride of w r t the input image | Arg ("pooled_h","(int) default 1; Pooled output Y's height.").Arg("pooled_w" |
Pooled output Y s width | Arg ("sampling_ratio","(int) default -1; number of sampling points in the interpolation grid ""used to compute the output value of each pooled output bin. If > 0, ""then exactly sampling_ratio x sampling_ratio grid points are used. If ""<= 0, then an adaptive number of grid points are used (computed as ""ceil(roi_width / pooled_w), and likewise for height).").Input(0 |
Pooled output Y s width feature map input of | shape (N, C, H, W).") .Input( 1 |
REGISTER_CPU_OPERATOR (RoIPool, RoIPoolOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (RoIPoolGradient, RoIPoolGradientOp< float, CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){ArgumentHelper helper(def);const StorageOrder order=StringToStorageOrder(helper.GetSingleArgument< string >("order","NCHW"));const TensorShape &X=in[0];const int num_channels=(order==StorageOrder::NCHW?X.dims(1):X.dims(3));const TensorShape &R=in[1];const int num_rois=R.dims(0);const int pooled_height=helper.GetSingleArgument< int >("pooled_h", 1);const int pooled_width=helper.GetSingleArgument< int >("pooled_w", 1);TensorShape Y=CreateTensorShape(vector< int >({num_rois, num_channels, pooled_height, pooled_width}), X.data_type());bool is_test=helper.GetSingleArgument< int >(OpSchema::Arg_IsTest, 0);if(!is_test){TensorShape argmaxes=Y;argmaxes.set_data_type(TensorProto_DataType_INT32);return vector< TensorShape >({Y, argmaxes});}else{return vector< TensorShape >({Y});}}).SetDoc(R"DOC( Carries out ROI Pooling for Faster-RCNN. Depending on the mode | |
there are multiple output | argmaxes (train mode) Output case) DOC") .Arg( "is_test" |
there are multiple output If run in test mode and skip computation of argmaxes(used for" "gradient computation).Only one output tensor is produced." "(Default | OPERATOR_SCHEMA (RoIPoolGradient).NumInputs(4).NumOutputs(1) |
REGISTER_GRADIENT (RoIPool, GetRoIPoolGradient) | |
REGISTER_CPU_OPERATOR (Scale, ScaleOp< CPUContext >) | |
REGISTER_GRADIENT (Scale, GetScaleGradient) | |
REGISTER_CUDA_OPERATOR (Scale, ScaleOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient, AbstractLengthsWithMainInputGradientOp< float, int, CPUContext, WeightedSumReducerDef::template ReducerGradient< float, CPUContext >, true, true >) | |
REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientWeightedSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, WeightedSumReducerDef::template ReducerGradient< float, CPUContext >, true >) | |
REGISTER_CPU_OPERATOR (SparseLengthsIndicesInGradientSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, SumReducerDef::template ReducerGradient< float, CPUContext >, true >) | |
OPERATOR_SCHEMA (LengthsIndicesInGradientSumGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (LengthsIndicesInGradientSumGradient, AbstractLengthsGradientOp< float, int, CPUContext, SumReducerDef::template ReducerGradient< float, CPUContext >, true >) | |
REGISTER_CPU_OPERATOR (Selu, SeluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SeluGradient, SeluGradientOp< float, CPUContext >) | |
is applied to the tensor elementwise DOC | Arg ("alpha","(float) default to 1.6732~; affects the activation function itself. ""This should go with the weight initialization in the paper. "" See https://arxiv.org/abs/1706.02515 ").Arg("scale" |
affects the activation function itself | Input (0,"X","input tensor").Output(0 |
affects the activation function itself input tensor | InheritOnnxSchema ("Selu") |
SetDoc (R"DOC( SeluGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the selu function. )DOC").Arg("alpha" | |
affects the activation function itself | Input (0,"Y","input tensor").Input(1 |
REGISTER_GRADIENT (Selu, GetSeluGradient) | |
REGISTER_CPU_OPERATOR (AddPadding, AddPaddingOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (RemovePadding, RemovePaddingOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (GatherPadding, GatherPaddingOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (PadEmptySamples, PadEmptySamplesOp< CPUContext >) | |
REGISTER_GRADIENT (AddPadding, GetAddPaddingGradient) | |
REGISTER_GRADIENT (RemovePadding, GetRemovePaddingGradient) | |
SetDoc (R"DOC( Given a partitioned tensor T<N, D1..., Dn>, where the partitions are defined as ranges on its outer-most (slowest varying) dimension N, with given range lengths, return a tensor T<N + 2*padding_width, D1 ..., Dn> with paddings added to the start and end of each range. Optionally, different paddings can be provided for beginning and end. Paddings provided must be a tensor T<D1..., Dn>. If no padding is provided, add zero padding. If no lengths vector is provided, add padding only once, at the start and end of data. )DOC").Arg("padding_width" | |
Number of copies of padding to add around each range | Arg ("end_padding_width","(Optional) Specifies a different end-padding width.").Input(0 |
Number of copies of padding to add around each range T< N, D1..., Dn > Input data | Input (1,"lengths","(i64) Num of elements in each range. sum(lengths) = N.").Input(2 |
Number of copies of padding to add around each range T< N, D1..., Dn > Input data T< D1..., Dn > Padding data for range start | Input (3,"end_padding","T<D1..., Dn> (optional) Padding for range end. ""If not provided, start_padding is used as end_padding as well.").Output(0 |
Number of copies of padding to add around each range T< N, D1..., Dn > Input data T< D1..., Dn > Padding data for range start T< N+2 *padding_width, D1..., Dn > Padded data | Output (1,"lengths_out","(i64, optional) Lengths for each padded range.") |
SetDoc (R"DOC( Remove padding around the edges of each segment of the input data. This is the reverse operation of AddPadding, and uses the same arguments and conventions for input and output data format. )DOC").Arg("padding_width" | 
Outer size of padding to remove around each range T< N, D1..., Dn > Input data | Input (1,"lengths","(i64) Num of elements in each range. sum(lengths) = N. ""If not provided, considers all data as a single segment.").Output(0 |
Outer size of padding to remove around each range T< N, D1..., Dn > Input data T< N-2 *padding_width, D1..., Dn > Unpadded data | Output (1,"lengths_out","(i64, optional) Lengths for each unpadded range.") |
SetDoc (R"DOC( Gather the sum of start and end paddings in a padded input sequence. Used in order to compute the gradients of AddPadding w.r.t the padding tensors. )DOC").Arg("padding_width" | |
Outer size of padding present around each range T< N, D1..., Dn > Padded input data Sum of all start or of all paddings if end_padding_sum is not provided | Output (1,"end_padding_sum","T<D1..., Dn> Sum of all end paddings, if provided.") |
INT_MAX | SetDoc (R"DOC( Pad empty field given lengths and index features, Input(0) is a blob pointing to the lengths of samples in one batch, [Input(1),... Input(num_fields)] a list of tensors containing the data for each field of the features. PadEmptySamples is thread safe. )DOC").Input(0 |
INT_MAX A blob containing a pointer to the lengths | Output (0,"out_lengths","Tensor containing lengths with empty sample padded.") |
REGISTER_CPU_OPERATOR (Shape, ShapeOp< CPUContext >) | |
out[0] | add_dims (in[0].dims().size()) |
out[0] | set_data_type (TensorProto::INT32) |
SetDoc ("Produce a 1D int64 tensor with the shape of the input tensor.") | |
SHOULD_NOT_DO_GRADIENT (Shape) | |
REGISTER_CUDA_OPERATOR (Shape, ShapeOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Sigmoid, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SigmoidCPUFunctor >) | |
REGISTER_CPU_OPERATOR (SigmoidGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< SigmoidGradientCPUFunctor >>) | |
is applied to the tensor elementwise DOC output tensor | InheritOnnxSchema ("Sigmoid") |
SetDoc (R"DOC( SigmoidGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the sigmoid function. )DOC") | |
REGISTER_GRADIENT (Sigmoid, GetSigmoidGradient) | |
REGISTER_CPU_OPERATOR (Sin, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SinCPUFunctor >) | |
REGISTER_CPU_OPERATOR (SinGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< SinGradientCPUFunctor >>) | |
OPERATOR_SCHEMA (SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape() | |
REGISTER_GRADIENT (Sin, GetSinGradient) | |
REGISTER_CPU_OPERATOR (SinusoidPositionEncoding, SinusoidPositionEncodingOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Slice, SliceOp< int, CPUContext >) | |
REGISTER_CPU_OPERATOR (SliceGradient, SliceGradientOp< int, CPUContext >) | |
NumInputs (1, 3).NumOutputs(1).SetDoc(R"DOC( Produces a slice of the input tensor. Currently | |
REGISTER_CPU_OPERATOR (Softmax, SoftmaxOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SoftmaxGradient, SoftmaxGradientOp< float, CPUContext >) | |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this we must have or else the | operator will throw errors.) DOC") .Arg ("axis","(int) default to 1; describes the axis of the inputs when coerced ""to 2D; defaults to one because the 0th axis most likely describes ""the batch_size").Input(0 |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this we must have or else the The input tensor that s coerced into a matrix of | size (NxD)" "as described above.") .Output(0 |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this we must have or else the The input tensor that s coerced into a matrix of The softmax normalized output values with the same shape as input tensor | InheritOnnxSchema ("Softmax") |
OPERATOR_SCHEMA (SoftmaxGradient).NumInputs(2).NumOutputs(1) | |
REGISTER_GRADIENT (Softmax, GetSoftmaxGradient) | |
REGISTER_GRADIENT (SoftmaxFp16, GetSoftmaxGradient) | |
void | SoftmaxCPU (CPUContext &context, const int N, const int D, const float *Xdata, float *Ydata, float *scale, const float *sum_multiplier, bool logarithmic, float *rowmax) |
REGISTER_CPU_OPERATOR (SoftmaxWithLoss, SoftmaxWithLossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SoftmaxWithLossGradient, SoftmaxWithLossGradientOp< float, CPUContext >) | |
vector< TensorShape > | out (2) |
out[0] | set_data_type (logits.data_type()) |
out[0] | add_dims (batch_size) |
out[0] | add_dims (num_classes) |
SetDoc (R"DOC( Combined Softmax and Cross-Entropy loss operator. The operator computes the softmax normalized values for each layer in the batch of the given input, after which cross-entropy loss is computed. This operator is numerically more stable than separate Softmax and CrossEntropy ops. The inputs are a 2-D tensor (Tensor<float>) of size (batch_size x input_feature_dimensions) and tensor of labels (ground truth). Output is tensor with the probability for each label for each example (N x D) and averaged loss (scalar). Use parameter label_prob=1 to enable inputting labels as a probability distribution. Optional third input blob can be used to weight the samples for the loss. )DOC").Input(0 | |
Unscaled log probabilities | Input (1,"labels","Ground truth").Input(2 |
Unscaled log probabilities Optional blob to be used to weight the samples for the loss | Output (0,"softmax","Tensor with softmax cross entropy loss").Output(1 |
OPERATOR_SCHEMA (SoftmaxWithLossGradient).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (Softplus, SoftplusOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SoftplusGradient, SoftplusGradientOp< float, CPUContext >) | |
is applied to the tensor elementwise DOC input tensor | InheritOnnxSchema ("Softplus") |
REGISTER_GRADIENT (Softplus, GetSoftplusGradient) | |
REGISTER_CPU_OPERATOR (Softsign, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SoftsignCPUFunctor >) | |
REGISTER_CPU_OPERATOR (SoftsignGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< SoftsignGradientCPUFunctor >>) | |
by providing the same input and output blobs DOC | Input (0,"input","1-D input tensor").Output(0 |
by providing the same input and output blobs DOC The | softsign (x/(1+|x|)) values of the input tensor" "computed element-wise") .InheritOnnxSchema("Softsign") | 
SetDoc (R"DOC( Calculates the softsign gradient (sgn(x)/(1+|x|)^2) of the given input tensor element-wise. )DOC").Input(0 | |
D input tensor | Input (1,"input","1-D input tensor").Output(0 |
D input tensor The softsign | gradient (sgn(x)/(1+|x|)^2) values of the input tensor" "computed element-wise") |
REGISTER_GRADIENT (Softsign, GetSoftsignGradient) | |
REGISTER_CPU_OPERATOR (SpaceToBatch, SpaceToBatchOp< CPUContext >) | |
OPERATOR_SCHEMA (SpaceToBatch).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( SpaceToBatch for 4-D tensors of type T. Zero-pads and then rearranges (permutes) blocks of spatial data into batch. More specifically | |
REGISTER_CPU_OPERATOR (BatchToSpace, BatchToSpaceOp< CPUContext >) | |
OPERATOR_SCHEMA (BatchToSpace).NumInputs(1).NumOutputs(1).SetDoc(R"DOC( BatchToSpace for 4-D tensors of type T. Rearranges (permutes) data from batch into blocks of spatial data | |
REGISTER_GRADIENT (SpaceToBatch, GetSpaceToBatchGradient) | |
REGISTER_GRADIENT (BatchToSpace, GetBatchToSpaceGradient) | |
template<typename Context > | |
void | spaceToBatch (const Tensor< Context > &input, int pad_t, int pad_l, int block_size, Tensor< Context > *output, Context *) |
template<typename Context > | |
void | batchToSpace (const Tensor< Context > &input, int pad_t, int pad_l, int block_size, Tensor< Context > *output, Context *) |
REGISTER_CPU_OPERATOR (SparseNormalize, SparseNormalizeOp< float, CPUContext >) | |
Parameters to be normalized | Input (1,"indices","Sparse indices").Input(2 |
Parameters to be normalized Gradient computed | Output (0,"output_param","Normalized parameters").EnforceOneToOneInplace().Arg("use_max_norm" |
Parameters to be normalized Gradient computed A bool variable to control whether to use max norm or constant norm When constant norm is used so that all the embedding vectors are scaled to have an L2 norm equal to | A (see below argument norm=A).If use_max_norm | 
Parameters to be normalized Gradient computed A bool variable to control whether to use max norm or constant norm When constant norm is used so that all the embedding vectors are scaled to have a L2 norm equals to max norm is used so that embedding is scaled so that its l2 norm is no larger than A If an embedding s norm is less than A the embedding is left unchanged The default is True | Arg ("norm","L2 norm of the embedding. The default is 1.0.").SetDoc(R"DOC( Given a sparse matrix |
SHOULD_NOT_DO_GRADIENT (SparseNormalize) | |
REGISTER_CPU_OPERATOR (SparseToDense, SparseToDenseOp< CPUContext >) | |
value represented as indices vector and values tensor into a compacted tensor where the first dimension is determined by the first dimension of the input if it is given or the max index Missing values are filled with zeros The op supports duplicated indices and performs summation over corresponding values This behavior is useful for converting GradientSlices into dense representation After running this | len (mask)]+shape(default_value)`(if`lengths`is not provided the" "first dimension is omitted)") |
REGISTER_CPU_OPERATOR (SpatialBNGradient, SpatialBNGradientOp< CPUContext >) | |
NumInputs ({5, 7}).NumOutputs(3).AllowInplace( | |
REGISTER_GRADIENT (SpatialBN, GetSpatialBNGradient) | |
REGISTER_CPU_OPERATOR (SpatialBN, SpatialBNOp< CPUContext >) | |
REGISTER_CUDA_OPERATOR (SpatialBN, CudnnSpatialBNOp) | |
REGISTER_CUDA_OPERATOR (SpatialBNGradient, CudnnSpatialBNGradientOp) | |
REGISTER_CUDNN_OPERATOR (SpatialBN, CudnnSpatialBNOp) | |
REGISTER_CUDNN_OPERATOR (SpatialBNGradient, CudnnSpatialBNGradientOp) | |
REGISTER_CPU_OPERATOR (SpatialSoftmaxWithLoss, SpatialSoftmaxWithLossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SpatialSoftmaxWithLossGradient, SpatialSoftmaxWithLossGradientOp< float, CPUContext >) | |
CAFFE_ENFORCE_EQ (logits.dims_size(), 4) | |
CAFFE_ENFORCE_EQ (labels.dims_size(), 3) | |
SetDoc (R"DOC( Combined Spatial Softmax and Cross-Entropy loss operator. Similar to SoftmaxWithLoss, this operator computes the spatial softmax normalized values for each layer in the batch of the given input, after which cross-entropy loss is computed. This operator is numerically more stable than separate Softmax and CrossEntropy ops. The inputs are a 2-D tensor (Tensor<float>) of size (batch_size x input_feature_dimensions) and tensor of labels (ground truth). Output is tensor with the probability for each label in a pixel for each example (N x D x W x H) and averaged loss (scalar). For spatial softmax, weighting is by x,y position of the input. )DOC").Input(0 | |
OPERATOR_SCHEMA (SpatialSoftmaxWithLossGradient).NumOutputs(1) | |
REGISTER_CPU_OPERATOR (Sqrt, UnaryElementwiseOp< TensorTypes< float >, CPUContext, SqrtCPUFunctor >) | |
REGISTER_GRADIENT (Sqrt, GetSqrtGradient) | |
REGISTER_CPU_OPERATOR (SquareRootDivide, SquareRootDivideOp< CPUContext >) | |
SetDoc (R"DOC( Given DATA tensor with first dimension N and SCALE vector of the same size N produces an output tensor with same dimensions as DATA. Which consists of DATA slices. i-th slice is divided by sqrt(SCALE[i]) elementwise. If SCALE[i] == 0 output slice is identical to the input one (no scaling) Example: Data = [ [2.0, 4.0], [9.0, 12.0] ] SCALE = [4, 9] OUTPUT = [ [1.0, 2.0], [3.0, 4.0] ] )DOC") | |
REGISTER_GRADIENT (SquareRootDivide, GetSquareRootDivideGradient) | |
REGISTER_CPU_OPERATOR (StatRegistryCreate, StatRegistryCreateOp) | |
REGISTER_CPU_OPERATOR (StatRegistryUpdate, StatRegistryUpdateOp) | |
REGISTER_CPU_OPERATOR (StatRegistryExport, StatRegistryExportOp) | |
REGISTER_CPU_OPERATOR (TimerBegin, TimerBeginOp) | |
REGISTER_CPU_OPERATOR (TimerEnd, TimerEndOp) | |
REGISTER_CPU_OPERATOR (TimerGetAndEnd, TimerGetAndEndOp) | |
REGISTER_CPU_OPERATOR (TimerGet, TimerGetOp) | |
REGISTER_CPU_OPERATOR (CpuUtilizationReport, CpuUtilizationReportOp) | |
or the global with the values of counters for the given keys DOC | Input (0,"keys","1D string tensor with the key names to update.").Input(1 |
or the global with the values of counters for the given keys DOC int64 tensor with the values to update | Input (2,"handle","If provided, update the given StatRegistry. ""Otherwise, update the global singleton.") |
If export values from given StatRegistry export values from the global singleton StatRegistry | Output (0,"keys","1D string tensor with exported key names").Output(1 |
If export values from given StatRegistry export values from the global singleton StatRegistry int64 tensor with exported values | Output (2,"timestamps","The unix timestamp at counter retrieval.").Arg("reset" |
returning a pointer to it The timer is stopped by calling TimerEnd DOC | Arg ("counter_name","Name of the timer. If not provided, use output name.").Output(0 |
publishing a CAFFE_EVENT | Input (0,"timer","Pointer to timer, obtained from TimerBegin.") |
Pointer to obtained from TimerBegin | Output (0,"nanos","nanoseconds in int64") |
Delta in max CPU utilization in percentage as a float value | Arg ("stats_name","String name of the stat entry holding CPU utilization") |
CAFFE_KNOWN_TYPE (TimerInstance *) | |
CAFFE_KNOWN_TYPE (std::unique_ptr< caffe2::StatRegistry >) | |
REGISTER_CPU_OPERATOR (StopGradient, StopGradientOp< CPUContext >) | |
NumInputs (1, 1).NumOutputs(1 | |
NO_GRADIENT (StopGradient) | |
REGISTER_CUDA_OPERATOR (StopGradient, StopGradientOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (StumpFunc, StumpFuncOp< float, float, CPUContext >) | |
tensor of float | Output (0,"Y","tensor of float").SetDoc(R"DOC( Converts each input element into either high_ or low_value based on the given threshold. )DOC") |
NO_GRADIENT (StumpFunc) | |
REGISTER_CPU_OPERATOR (Summarize, SummarizeOp< float, CPUContext >) | |
SetDoc (R"DOC( Summarize computes four statistics of the input tensor (Tensor<float>)- min, max, mean and standard deviation. The output will be written to a 1-D tensor of size 4 if an output tensor is provided. Else, if the argument 'to_file' is greater than 0, the values are written to a log file in the root folder. )DOC").Arg("to_file" | |
default flag to indicate if the summarized statistics have to be written to a log file | Input (0,"data","The input data as Tensor<float>.").Output(0 |
SHOULD_NOT_DO_GRADIENT (Summarize) | |
REGISTER_CPU_OPERATOR (Swish, UnaryElementwiseOp< TensorTypes< float, double >, CPUContext, SwishCPUFunctor >) | |
REGISTER_CPU_OPERATOR (SwishGradient, SwishGradientOp< CPUContext >) | |
SetDoc (R"DOC( SwishGradient takes X, Y and dY and uses this to update dX according to the chain rule and derivatives of the swish function. )DOC") | |
REGISTER_GRADIENT (Swish, GetSwishGradient) | |
REGISTER_CPU_OPERATOR (Tanh, UnaryElementwiseOp< TensorTypes< float >, CPUContext, TanhCPUFunctor >) | |
REGISTER_CPU_OPERATOR (TanhGradient, BinaryElementwiseOp< TensorTypes< float >, CPUContext, WithoutBroadcast< TanhGradientCPUFunctor >>) | |
by providing the same input and output blobs DOC The hyperbolic tangent values of the input tensor computed element wise | InheritOnnxSchema ("Tanh") |
OPERATOR_SCHEMA (TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace( | |
REGISTER_GRADIENT (Tanh, GetTanhGradient) | |
REGISTER_CPU_OPERATOR (TensorProtosDBInput, TensorProtosDBInput< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( TensorProtosDBInput is a simple input operator that basically reads things from a db where each key-value pair stores an index as key, and a TensorProtos object as value. These TensorProtos objects should have the same size, and they will be grouped into batches of the given size. The DB Reader is provided as input to the operator and it returns as many output tensors as the size of the TensorProtos object. Each output will simply be a tensor containing a batch of data with size specified by the 'batch_size' argument containing data from the corresponding index in the TensorProtos objects in the DB. )DOC").Arg("batch_size" |
INT_MAX default the number of samples in a batch The default value of means that the | operator will attempt to insert the" "entire data in a single output blob.") .Input (0,"data","A pre-initialized DB reader. Typically, this is obtained ""by calling CreateDB operator with a db_name and a db_type. The ""resulting output blob is a DB Reader tensor").Output(0 |
INT_MAX default the number of samples in a batch The default value of means that the The output tensor in which the batches of data are returned The number of output tensors is equal to the size | of (number of TensorProto's in) the TensorProtos objects stored in the" "DB as values.Each output tensor will be of size specified by the" "'batch_size'argument of the operator") |
NO_GRADIENT (TensorProtosDBInput) | |
REGISTER_CUDA_OPERATOR (TensorProtosDBInput, TensorProtosDBInput< CUDAContext >) | |
void | convert (TensorProto_DataType dst_type, const char *src_start, const char *src_end, void *dst) |
CAFFE_KNOWN_TYPE (std::unique_ptr< TextFileReaderInstance >) | |
REGISTER_CPU_OPERATOR (CreateTextFileReader, CreateTextFileReaderOp) | |
REGISTER_CPU_OPERATOR (TextFileReaderRead, TextFileReaderReadOp) | |
Path to the file | Arg ("num_passes","Number of passes over the file.").Arg("field_types" |
Path to the file List with type of each field Type enum is found at core DataType | Output (0,"handler","Pointer to the created TextFileReaderInstance.") |
INT_MAX | SetDoc ("Read a batch of rows from the given text file reader instance. ""Expects the number of fields to be equal to the number of outputs. ""Each output is a 1D tensor containing the values for the given field ""for each row. When end of file is reached, returns empty tensors.").Input(0 |
INT_MAX Pointer to an existing TextFileReaderInstance | Arg ("batch_size","Maximum number of rows to read.") |
NO_GRADIENT (CreateTextFileReader) | |
NO_GRADIENT (TextFileReaderRead) | |
REGISTER_CPU_OPERATOR (ThresholdedRelu, ThresholdedReluOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ThresholdedReluGradient, ThresholdedReluGradientOp< float, CPUContext >) | |
CostInferenceFunction (PointwiseCostInference< 2 >).IdenticalTypeAndShape().SetDoc(R"DOC( ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor) where the rectified linear function | |
REGISTER_CPU_OPERATOR (Tile, TileOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (TileGradient, TileGradientOp< float, CPUContext >) | |
if (in.size() > 1) | |
out[0] | set_dims (canonical_axis, out[0].dims().Get(canonical_axis)*tiles) |
SetDoc (R"DOC( Constructs a tensor by tiling a given tensor along a specified axis. This operation creates a new tensor by replicating the input tensor 'tiles' times along dimension 'axis'. The output tensor's 'axis'th dimension has input.dims(axis) * tiles elements, and the values of input are replicated 'tiles' times along the 'axis'th dimension. For example, tiling [[a b c d]] by tile=2, axis=0 produces [[a b c d], [a b c d]]. )DOC").Arg("tiles" | |
Number of replicas | Arg ("axis","Axis to replicate along").Input(0 |
Number of replicas The input tensor | Input (1,"tiles","(optional) Number of replicas (overrides argument)").Input(2 |
Number of replicas The input tensor optional Axis to replicate | along (overrides argument)") .Output( 0 |
Number of replicas The input tensor optional Axis to replicate Tensor that will contain input replicated along the given axis | InheritOnnxSchema ("Tile") |
OPERATOR_SCHEMA (TileGradient).NumInputs(1 | |
REGISTER_GRADIENT (Tile, GetTileGradient) | |
REGISTER_CPU_OPERATOR (TopK, TopKOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (TopKGradient, TopKGradientOp< float, CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out={in[0], in[0]};ArgumentHelper helper(def);auto k=helper.GetSingleArgument("k",-1);auto dims_size=in[0].dims_size();out[0].set_dims(dims_size-1, k);out[1].set_dims(dims_size-1, k);out[1].set_data_type(TensorProto_DataType_INT32);if(def.output_size() > 2){TensorShape flatten_indices_shape;flatten_indices_shape.set_data_type(TensorProto_DataType_INT32);flatten_indices_shape.add_dims(std::accumulate(in[0].dims().begin(), in[0].dims().end()-1, 1, std::multiplies< long >())*k);out.push_back(flatten_indices_shape);}return out;}).SetDoc(R"DOC( Retrieve the top-K elements for the last dimension. Given an input tensor of shape [a_1 | |
r and integer argument return two k which contains the values of the top k elements along the last dimension Index tensor of shape[a_1, a_2,..., a_n, k] which contains the indices of the top k | elements (original indices from the input tensor).Given two equivalent values |
r and integer argument return two k which contains the values of the top k elements along the last dimension Index tensor of shape[a_1, a_2,..., a_n, k] which contains the indices of the top k this | operator uses the indices along the last dim-ension as a tiebreaker.That is, the element with the lower index will appear first.) DOC") .Input (0,"X","Tensor of shape [a_1, a_2, ..., a_n, r]").Output(0 |
r and integer argument return two k which contains the values of the top k elements along the last dimension Index tensor of shape[a_1, a_2,..., a_n, k] which contains the indices of the top k this Tensor of shape[a_1, a_2,..., a_n, k] containing top K values from the input tensor | Output (1,"Indices","Tensor of shape [a_1, a_2, ..., a_n, k] containing"" the corresponding input tensor indices for the top K values.").Output(2 |
r and integer argument return two k which contains the values of the top k elements along the last dimension Index tensor of shape[a_1, a_2,..., a_n, k] which contains the indices of the top k this Tensor of shape[a_1, a_2,..., a_n, k] containing top K values from the input tensor Flatten Tensor of shape[a_1 *a_2 *...*a_n *k] containing the indices into the flatten input | Arg ("k","Number of top elements to retrieve") |
OPERATOR_SCHEMA (TopKGradient).NumInputs(3).NumOutputs(1) | |
REGISTER_GRADIENT (TopK, GetTopKGradient) | |
REGISTER_CPU_OPERATOR (Transpose, TransposeOp< CPUContext >) | |
if (axes.empty()) | |
CAFFE_ENFORCE (valid_axes,"Axes argument passed in had invalid values") | |
CAFFE_ENFORCE (axes.size()==tensor_size,"Axes argument passed in had the incorrect size") | |
for (auto axis=axes.begin();axis!=axes.end();++axis) | |
SetDoc (R"DOC( Transpose the input tensor similar to numpy.transpose. For example, when axes=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape will be (2, 1, 3). )DOC").Arg("axes" | |
A list of integers By reverse the otherwise permute the axes according to the values given Transposed output | InheritOnnxSchema ("Transpose") |
REGISTER_GRADIENT (Transpose, GetTransposeGradient) | |
REGISTER_CUDNN_OPERATOR (Transpose, CuDNNTransposeOp) | |
REGISTER_CPU_OPERATOR (WallClockTime, WallClockTimeOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Print, PrintOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (FlattenToVec, FlattenToVecOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Alias, AliasOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ResizeLike, ResizeLikeOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SumInt, SumOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (WeightedSum, WeightedSumOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (WeightedSumGradient, WeightedSumGradientOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (ScatterWeightedSum, ScatterWeightedSumOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (ScatterAssign, ScatterAssignOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (EnsureCPUOutput, CopyOp< CPUContext, CPUContext, CPUContext >) | |
REGISTER_CPU_OPERATOR (CopyFromCPUInput, CopyOp< CPUContext, CPUContext, CPUContext >) | |
REGISTER_CPU_OPERATOR (CopyOnDeviceLike, CopyOnDeviceLikeOp< CPUContext, CPUContext, CPUContext >) | |
REGISTER_CPU_OPERATOR (Copy, CopyOp< CPUContext, CPUContext, CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsToShape, LengthsToShapeOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (HasElements, HasElementsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (IsEmpty, IsEmptyOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Gather, GatherOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (GatherRanges, GatherRangesOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsGather, LengthsGatherOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (Unique, UniqueOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsToSegmentIds, LengthsToSegmentIdsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsToRanges, LengthsToRangesOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SegmentIdsToLengths, SegmentIdsToLengthsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SegmentIdsToRanges, SegmentIdsToRangesOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (LengthsToWeights, LengthsToWeightsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (EnsureDense, EnsureDenseOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (AccumulateHistogram, AccumulateHistogramOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (UnsafeCoalesce, UnsafeCoalesceOp< CPUContext >) | |
bool saves contents to the root folder of the current appending the tensor contents to a file named after the blob name logs to stderr | Input (0,"tensor","The tensor to print.") |
OPERATOR_SCHEMA (LengthsToShape).NumInputs(1).NumOutputs(1) | |
out[0] | add_dims (total) |
SetDoc (R"DOC( Flattens the input tensor into a 1D vector. )DOC").Input(0 | |
out | push_back (in[1]) |
SetDoc (R"DOC( Produces tensor containing data of first input and shape of second input. )DOC").Input(0 | |
Tensor whose data will be copied into the output | Input (1,"shape_tensor","Tensor whose shape will be applied to output.").Output(0 |
NumInputs ([](int n){return(n > 0 &&n%2==0);}).NumOutputs(1).AllowInplace( | |
weight tensor pairs Input should be in the form where X_i all have the same and weight_i are size tensors that specifies the weight of each vector Note that if one wants to do in place it could only be done with X_0 also as the but not other X_i DOC | Input (0,"data_0","First of the input tensors.").Input(0 |
weight tensor pairs Input should be in the form where X_i all have the same and weight_i are size tensors that specifies the weight of each vector Note that if one wants to do in place it could only be done with X_0 also as the but not other X_i DOC Weight of the first input in the sum | Output (0,"output","Result containing weighted elem-wise sum of inputs.") |
NumInputs ([](int n){return(n > 0 &&n%2==1);}).NumOutputs(1 | |
NumInputs ([](int n){return(n > 3 &&(n-3)%2==0);}).NumOutputs(1).EnforceInplace( | |
SetDoc (R"DOC( Similar to WeightedSum, computes the weighted sum of several tensors, with the difference that inputs are sliced tensors. The first tensor has to be in-place and only slices of it on the first dimension as indexed by INDICES will be updated. Note: The op pretty much ignores the exact shapes of the input arguments and cares only about sizes. It's done for performance consideration to avoid unnecessary reshapes. Only first dimension of X_0 is important, let's call it N. If M is the total size of X_0 and K is the size of INDICES then X_i is assumed to be of shape K x (M / N) regardless of the real shape. Note: Each update in INDICES is applied independently which means that if duplicated elements are present in INDICES the corresponding slice of X_0 will be scaled multiple times. Manual collapsing of INDICES is required beforehand if necessary. Note: Updates are applied sequentially by inputs which might have undesired consequences if the input tensor is accessed concurrently by different op (e.g. when doing Hogwild). Other threads might see intermediate results even on individual slice level, e.g. X_0 scaled by weight_0 but without any updates applied. Currently only works on CPU because of access to INDICES. )DOC").Input(0 | |
Tensor to be updated | Input (1,"Weight_0","Scalar weight for X_0, applied only to slices affected.").Input(2 |
Tensor to be updated D list of indices on the first dimension of X_0 that need to be updated | Input (3,"X_1","Update slices, with shape len(INDICES) + shape(X_0)[1:]").Input(4 |
Tensor to be updated D list of indices on the first dimension of X_0 that need to be updated Scalar weight for X_1 update | Output (0,"X_0","Has to be exactly the same tensor as the input 0").EnforceInplace( |
SetDoc (R"DOC( Update slices of the tensor in-place by overriding current value. Note: The op pretty much ignores the exact shapes of the input arguments and cares only about sizes. It's done for performance consideration to avoid unnecessary reshapes. Only first dimension of X_0 is important, let's call it N. If M is the total size of X_0 and K is the size of INDICES then X_i is assumed to be of shape K x (M / N) regardless of the real shape. Note: Each update in INDICES is applied independently which means that if duplicated elements are present in INDICES arbitrary one will win. Currently only works on CPU because of access to INDICES. )DOC").Input(0 | |
Tensor to be updated | Input (1,"INDICES","1-D list of indices on the first dimension""of X_0 that need to be updated").Input(2 |
REGISTER_CUDA_OPERATOR (Print, PrintOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Flatten, FlattenOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (FlattenToVec, FlattenToVecOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Alias, AliasOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (ResizeLike, ResizeLikeOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (Sum, SumOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (WeightedSum, WeightedSumOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (EnsureCPUOutput, CopyOp< CUDAContext, CPUContext, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CopyFromCPUInput, CopyOp< CUDAContext, CUDAContext, CPUContext >) | |
REGISTER_CUDA_OPERATOR (CopyGPUToCPU, CopyOp< CUDAContext, CPUContext, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CopyCPUToGPU, CopyOp< CUDAContext, CUDAContext, CPUContext >) | |
REGISTER_CUDA_OPERATOR (Copy, CopyOp< CUDAContext, CUDAContext, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CopyOnDeviceLike, CopyOnDeviceLikeOp< CUDAContext, CUDAContext, CUDAContext >) | |
REGISTER_CUDA_OPERATOR (UnsafeCoalesce, UnsafeCoalesceOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (VariableLengthSequencePadding, VariableLengthSequencePaddingOp< float, CPUContext >) | |
SetDoc (R"DOC( Super special-case operator. Used to pad a tensor to mimic pytorch's pad_packed_sequence. Given an input tensor INPUT of size NxBxM and an input tensor LENS of size B, where N = maximum sequence length B = batch size M = hidden size set each element of INPUT to zero if it is is past the end of the corresponding sequence (i.e. if LENS[j] > i for an index (i,j,k)). )DOC") | |
REGISTER_CPU_OPERATOR (WeightedMultiSampling, WeightedMultiSamplingOp< CPUContext >) | |
if (in[0].dims(0)==0) | |
if (args.HasArgument("num_samples")) | |
SetDoc (R"DOC( The operator performs sampling based on the input sampling weights. All weights are cummulative probability thus sorted. The output is a 1-D tensor (Tensor<int>). If two inputs are given, the second input is used to provide shape of the output sample tensor. Otherwise, we use argument `num_samples` to determine the number of samples to generate. )DOC").Input(0 | |
An optional D Tensor< float > Input cumulative sampling | probability (such as[0.2, 0.5, 0.8, 1.5])." "All weights must be non-negative numbers.Note that the last value of" "CDF is not necessary 1.If the last value is not 1 |
An optional D Tensor< float > Input cumulative sampling all values in sampling_cdf will be scaled by this number | Input (1,"shape_tensor (optional)","Tensor whose shape will be applied to output.").Output(0 |
An optional D Tensor< float > Input cumulative sampling all values in sampling_cdf will be scaled by this number The output tensor contains indices sampled from distribution given by the weight vector in the input tensor The output is a D Tensor< int > of size determined by argument num_samples or the second input tensor | Arg ("num_samples","number of samples to sample from the input data") |
SHOULD_NOT_DO_GRADIENT (WeightedMultiSample) | |
REGISTER_CPU_OPERATOR (WeightedSample, WeightedSampleOp< float, CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &in){vector< TensorShape > out(2);int batch_size=in[0].dims(0);out[0]=CreateTensorShape(vector< int >{batch_size}, TensorProto::INT32);out[1]=CreateTensorShape(vector< int >{batch_size}, TensorProto::FLOAT);return out;}).SetDoc(R"DOC( The operator performs sampling based on the input sampling weights for each batch. All weights must be non-negative numbers. The input is a 2-D tensor (Tensor<float>) of size (batch_size x weights_dim). For each batch | |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D | tensor (Tensor< int >) of size(batch_size x 1) and contains the index(es) of the sampled output.) DOC") .Input( 0 |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor< float > of | size (batch_size x weights_dim)." "All weights must be non-negative numbers.") .Input( 1 |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor< float > of An optional D Tensor< float > of The output tensor contains | index (es) sampled from distribution given" "by the weight vector(s) in the input tensor" "The output is a 1-D Tensor< int > of size(batch_size x 1)") .Output( 1 |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor< float > of An optional D Tensor< float > of The output tensor contains The output tensor contains | value (s) selected by the sampled index(es)" "It is a 1-D Tensor< float > of size(batch_size x 1)") |
SHOULD_NOT_DO_GRADIENT (WeightedSample) | |
REGISTER_CPU_OPERATOR (While, WhileOp< CPUContext >) | |
INT_MAX | SetDoc (R"DOC( 'While' control operator, first input is a scalar boolean blob that stores loop's condition value. Accepts 'loop_net' (required) and 'cond_net' (optional) arguments for loop's body and condition subnets respectively. If condition subnet is specified, it is executed before the first and after each iteration. Subnets are executed in the same workspace as 'While'. )DOC").Arg("loop_net" |
INT_MAX Net executed on each iteration | Arg ("cond_net","Net to (re)compute condition value").Input(0 |
REGISTER_CUDA_OPERATOR (While, WhileOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (ZeroGradient, ZeroGradientOp< CPUContext >) | |
REGISTER_GRADIENT (ZeroGradient, GetZeroGradientOpGradient) | |
REGISTER_CUDA_OPERATOR (ZeroGradient, ZeroGradientOp< CUDAContext >) | |
EMBEDDING_SPECIALIZATION (int32_t, float, float, false) | |
EMBEDDING_SPECIALIZATION (int64_t, float, float, false) | |
EMBEDDING_SPECIALIZATION (int32_t, float16, float, false) | |
EMBEDDING_SPECIALIZATION (int64_t, float16, float, false) | |
EMBEDDING_SPECIALIZATION (int32_t, uint8_t, float, false) | |
EMBEDDING_SPECIALIZATION (int64_t, uint8_t, float, false) | |
EMBEDDING_SPECIALIZATION (int32_t, float, float, true) | |
EMBEDDING_SPECIALIZATION (int64_t, float, float, true) | |
EMBEDDING_SPECIALIZATION (int32_t, float16, float, true) | |
EMBEDDING_SPECIALIZATION (int64_t, float16, float, true) | |
EMBEDDING_SPECIALIZATION (int32_t, uint8_t, float, true) | |
EMBEDDING_SPECIALIZATION (int64_t, uint8_t, float, true) | |
template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false> | |
void | EmbeddingLookup (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const InType *input, const IndexType *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, OutType *out) |
Embedding lookup with reduction. More... | |
void | EmbeddingLookup_int32_t_float_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int32_t_float_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_float_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_float_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int32_t_float16_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int32_t_float16_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_float16_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_float16_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int32_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | EmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_float_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_float_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_float16_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_float16_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_float16_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_float16_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const float16 *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int32_t_uint8_t_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int32_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_false__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
void | Fused8BitRowwiseEmbeddingLookup_int64_t_uint8_t_float_true__avx2_fma (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const uint8_t *input, const int64_t *indices, const int *lengths, const float *weights, bool normalize_by_lengths, float *out) |
FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION (int32_t, uint8_t, float) | |
FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION (int64_t, uint8_t, float) | |
template<typename IndexType , typename InType , typename OutType , bool IS_WEIGHT_POSITIONAL = false> | |
void | Fused8BitRowwiseEmbeddingLookup (const TIndex block_size, const TIndex output_size, const TIndex index_size, const TIndex data_size, const InType *input, const IndexType *indices, const int *lengths, const float *weights, bool normalize_by_lengths, OutType *out) |
Embedding lookup with reduction. More... | |
template<> | |
void | TypedAxpy< float, float > (int N, const float a, const float *x, float *y) |
void | TypedAxpy_float16_float__base (int N, const float a, const float16 *x, float *y) |
template<> | |
void | TypedAxpy< float16, float > (int N, const float a, const float16 *x, float *y) |
void | TypedAxpy_uint8_float__base (int N, const float a, const std::uint8_t *x, float *y) |
template<> | |
void | TypedAxpy< std::uint8_t, float > (int N, const float a, const std::uint8_t *x, float *y) |
template<typename IN , typename OUT > | |
void | TypedAxpy (int N, const OUT a, const IN *x, OUT *y) |
void | TypedAxpy_float16_float__avx_f16c (int N, const float a, const float16 *x, float *y) |
void | TypedAxpy_float16_float__avx2_fma (int N, const float a, const float16 *x, float *y) |
void | TypedAxpy_uint8_float__avx2_fma (int N, const float a, const std::uint8_t *x, float *y) |
CAFFE_KNOWN_TYPE (std::shared_ptr< BlobsQueue >) | |
REGISTER_CPU_OPERATOR (CreateBlobsQueue, CreateBlobsQueueOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (EnqueueBlobs, EnqueueBlobsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (DequeueBlobs, DequeueBlobsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (CloseBlobsQueue, CloseBlobsQueueOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SafeEnqueueBlobs, SafeEnqueueBlobsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SafeDequeueBlobs, SafeDequeueBlobsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (WeightedSampleDequeueBlobs, WeightedSampleDequeueBlobsOp< CPUContext >) | |
OPERATOR_SCHEMA (CreateBlobsQueue).NumInputs(0).NumOutputs(1) | |
NumInputsOutputs ([](int inputs, int outputs){return inputs >=2 &&outputs >=1 &&inputs==outputs+1;}).EnforceInplace([](int input | |
NumInputsOutputs ([](int inputs, int outputs){return inputs==1 &&outputs >=1;}).SetDoc(R"DOC( Dequeue the blobs from queue. )DOC").Arg("timeout_secs" | |
Timeout in The shared pointer for the BlobsQueue | Output (0,"blob","The blob to store the dequeued data") |
OPERATOR_SCHEMA (CloseBlobsQueue).NumInputs(1).NumOutputs(0) | |
NumInputsOutputs ([](int inputs, int outputs){return inputs >=2 &&outputs >=2 &&inputs==outputs;}).EnforceInplace([](int input | |
SetDoc (R"DOC( Enqueue the blobs into queue. When the queue is closed and full, the output status will be set to true which can be used as exit criteria for execution step. The 1st input is the queue and the last output is the status. The rest are data blobs. )DOC").Input(0 | |
NumInputsOutputs ([](int inputs, int outputs){return inputs==1 &&outputs >=2;}).SetDoc(R"DOC( Dequeue the blobs from queue. When the queue is closed and empty | |
the output status will be set to true which can be used as exit criteria for execution step The input is the queue and the last output is the status The rest are data blobs DOC | Arg ("num_records","(default 1) If > 1, multiple records will be dequeued and tensors ""for each column will be concatenated. This requires all tensors in ""the records to be at least 1D, and to have the same inner dimensions.").Input(0 |
INT_MAX | SetDoc (R"DOC( Dequeue the blobs from multiple queues. When one of queues is closed and empty, the output status will be set to true which can be used as exit criteria for execution step. The 1st input is the queue and the last output is the status. The rest are data blobs. )DOC").Arg("weights" |
INT_MAX Weights for sampling from multiple queues | Arg ("table_idx_blob","The index of the blob (among the output blob list) ""that will be used to store the index of the table chosen to read the ""current batch.") |
NO_GRADIENT (CreateBlobsQueue) | |
NO_GRADIENT (EnqueueBlobs) | |
NO_GRADIENT (DequeueBlobs) | |
NO_GRADIENT (CloseBlobsQueue) | |
NO_GRADIENT (SafeEnqueueBlobs) | |
NO_GRADIENT (SafeDequeueBlobs) | |
NO_GRADIENT (WeightedSampleDequeueBlobs) | |
REGISTER_CUDA_OPERATOR (CreateBlobsQueue, CreateBlobsQueueOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (EnqueueBlobs, EnqueueBlobsOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (DequeueBlobs, DequeueBlobsOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (CloseBlobsQueue, CloseBlobsQueueOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (SafeEnqueueBlobs, SafeEnqueueBlobsOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (SafeDequeueBlobs, SafeDequeueBlobsOp< CUDAContext >) | |
CAFFE_KNOWN_TYPE (RebatchingQueuePtr) | |
REGISTER_CPU_OPERATOR (Adagrad, AdagradOp< float, CPUContext >) | |
SetDoc (R"DOC( Computes the AdaGrad update for an input gradient and accumulated history. Concretely, given inputs (param, grad, moment, learning_rate), computes new_moment = moment + square(grad) new_grad = learning_rate * grad / (sqrt(new_moment) + epsilon) new_param = param + new_grad and returns (new_param, new_moment). )DOC").Input(0 | |
Parameters to be updated | Input (1,"moment","Moment history").Input(2 |
Parameters to be updated Gradient computed | Input (3,"lr","learning rate").Output(0 |
Parameters to be updated Gradient computed Updated parameters | Output (1,"output_moment","Updated moment").Arg("epsilon" |
Parameters to be updated Gradient computed Updated parameters Default | Arg ("decay","Default 1. If it is in (0, 1), the gradient square sum ""is decayed by this factor.") |
REGISTER_CPU_OPERATOR (SparseAdagrad, SparseAdagradOp< float, CPUContext >) | |
runs the dense AdaGrad update | on (param, grad, moment[indices], lr) |
runs the dense AdaGrad update and | returns (new_param, new_moment) as in the dense case.) DOC") .Input(0 |
runs the dense AdaGrad update and Parameters to be updated Sparse indices | Input (3,"grad","Gradient computed").Input(4 |
runs the dense AdaGrad update and Parameters to be updated Sparse indices learning rate | Output (0,"output_param","Updated parameters").Output(1 |
runs the dense AdaGrad update and Parameters to be updated Sparse indices learning rate Updated moment | Arg ("epsilon","Default 1e-5") |
REGISTER_CPU_OPERATOR (RowWiseSparseAdagrad, RowWiseSparseAdagradOp< float, CPUContext >) | |
runs a modified sparse Adagrad update and | returns (new_param, new_momwnr) |
runs a modified sparse Adagrad update and where moment is a tensor with length equal to the number of rows in and the new moment is calculated by adding the average squared sum of gradients across each row Note that indices must also be a tensor indexing into the rows of param DOC | Input (0,"param","Parameters to be updated").Input(1 |
runs a modified sparse Adagrad update and where moment is a tensor with length equal to the number of rows in and the new moment is calculated by adding the average squared sum of gradients across each row Note that indices must also be a tensor indexing into the rows of param DOC Moment history | Input (2,"indices","Sparse indices").Input(3 |
runs a modified sparse Adagrad update and where moment is a tensor with length equal to the number of rows in and the new moment is calculated by adding the average squared sum of gradients across each row Note that indices must also be a tensor indexing into the rows of param DOC Moment history Gradient computed | Input (4,"lr","learning rate").Output(0 |
runs a modified sparse Adagrad update and where moment is a tensor with length equal to the number of rows in and the new moment is calculated by adding the average squared sum of gradients across each row Note that indices must also be a tensor indexing into the rows of param DOC Moment history Gradient computed Updated parameters | Output (1,"output_moment_1","Updated moment").Arg("epsilon" |
SHOULD_NOT_DO_GRADIENT (Adagrad) | |
SHOULD_NOT_DO_GRADIENT (SparseAdagrad) | |
SHOULD_NOT_DO_GRADIENT (RowWiseSparseAdagrad) | |
template<typename Context > | |
void | adagrad_update (int N, const float *w, const float *g, const float *h, float *nw, float *nh, float epsilon, float decay, const float *lr, Context *) |
REGISTER_CPU_OPERATOR (Adam, AdamOp< float, CPUContext >) | |
NumInputs (6).NumOutputs(3).AllowInplace( | |
SetDoc (R"DOC( Computes the Adam update (https://arxiv.org/abs/1412.6980) for an input gradient and momentum parameters. Concretely, given inputs (param, m1, m2, grad, lr, iters), t = iters + 1 corrected_local_rate = lr * sqrt(1 - power(beta2, t)) / (1 - power(beta1, t)) m1_o = (beta1 * m1) + (1 - beta1) * grad m2_o = (beta2 * m2) + (1 - beta2) * np.square(grad) grad_o = corrected_local_rate * m1_o / \ (sqrt(m2_o) + epsilon) param_o = param + grad_o and returns (param_o, m1_o, m2_o) )DOC").Input(0 | |
Parameters to be updated | Input (1,"moment_1","First moment history").Input(2 |
Parameters to be updated Second moment history learning rate | Input (5,"iter","iteration number").Output(0 |
Parameters to be updated Second moment history learning rate Updated parameters | Output (1,"output_moment_1","Updated first moment").Output(2 |
Parameters to be updated Second moment history learning rate Updated parameters Updated second moment | Arg ("beta1","Default 0.9").Arg("beta2" |
REGISTER_CPU_OPERATOR (SparseAdam, SparseAdamOp< float, CPUContext >) | |
SetDoc (R"DOC( Computes the Adam Update for the sparse case. Given inputs (param, moment1, moment2, indices, grad, lr, iter), runs the dense Adam on (param, moment1[indices], moment2[indices], lr, iter) and returns (new_param, new_moment1, new_moment2) as in dense case )DOC").Input(0 | |
Parameters to be updated Second moment history | Input (3,"indices","Sparse indices").Input(4 |
Parameters to be updated Second moment history Gradient computed | Input (5,"lr","learning rate").Input(6 |
Parameters to be updated Second moment history Gradient computed iteration number Updated first moment | Output (2,"output_moment_2","Updated second moment").Arg("beta1" |
Parameters to be updated Second moment history Gradient computed iteration number Updated first moment Default | Arg ("beta2","Default 0.999").Arg("epsilon" |
REGISTER_CPU_OPERATOR (RowWiseSparseAdam, RowWiseSparseAdamOp< float, CPUContext >) | |
SetDoc (R"DOC( Computes a modified Adam Update for the sparse case. Given inputs (param, moment1, moment2, indices, grad, lr, iter), runs the Adam update on (param, moment1[indices], moment2[indices], lr, iter) and returns (new_param, new_moment1, new_moment2), where moment2 is a 1D tensor with length equal to the number of rows in param: shape(moment2) == shape(param)[0]. Each element of moment2 is applied to an entire row of param, and the new moment2 values are calculated by averaging across the row. )DOC").Input(0 | |
SHOULD_NOT_DO_GRADIENT (Adam) | |
SHOULD_NOT_DO_GRADIENT (SparseAdam) | |
SHOULD_NOT_DO_GRADIENT (RowWiseSparseAdam) | |
template<typename Context > | |
void | adam_update (int N, const float *g, const float *m, const float *v, float *ng, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr, Context *) |
template<typename Context > | |
void | adam_compute (int N, const float *w, const float *g, const float *m, const float *v, float *nw, float *nm, float *nv, float beta1, float beta2, float eps_hat, float correction, const float *lr, Context *) |
REGISTER_CPU_OPERATOR (ClipTensorByScaling, ClipTensorByScalingOp< CPUContext >) | |
SetDoc (R"DOC( Clips the input tensor by scaling based on the input value and the threshold. The value is usually the (pre-computed) norm of the tensor. If the value is larger than the threshold, scaling would be performed in this way: tensor *= (threshold / value). An optional input called additional_threshold can be provided which will scale the original threshold before it is used. That is, the final threshold will become threshold * additional_threshold. This op could be used for gradient clipping. )DOC").Input(0 | |
Tensor of floats to be clipped | Input (1,"val","Value to be compared against the threshold").Input(2 |
Tensor of floats to be clipped An optional additional threshold to scale the original threshold | Arg ("threshold","Threshold to determine whether to scale down the tensor").Output(0 |
SHOULD_NOT_DO_GRADIENT (ClipTensorByScaling) | |
template<class Context > | |
void | fp16_momentum_sgd_update (int N, const float16 *g, const float16 *m, float16 *ng, float16 *nm, const float *lr, float momentum, bool nesterov, float weight_decay, bool fp32_update, float16 *param, Context *) |
template<class Context > | |
void | fp32_momentum_sgd_update (int N, const float *g, const float *m, float *ng, float *nm, const float *lr, float momentum, bool nesterov, float weight_decay, float *param, Context *) |
template<class T > | |
T | sgn (const T x) |
template<typename T > | |
void | ftrl_compute (const T w, const T n, const T z, const T g, T &nw, T &nn, T &nz, const FtrlParams< T > ¶ms) |
template<typename Context , typename T > | |
void | ftrl_update (int N, const T *w, const T *nz, const T *g, T *new_w, T *new_nz, const FtrlParams< T > ¶ms, Context *) |
REGISTER_CPU_OPERATOR (Iter, IterOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (AtomicIter, AtomicIterOp< CPUContext >) | |
REGISTER_BLOB_SERIALIZER ((TypeMeta::Id< std::unique_ptr< std::mutex >>()), MutexSerializer) | |
REGISTER_BLOB_DESERIALIZER (std::unique_ptr< std::mutex >, MutexDeserializer) | |
SetDoc (R"DOC( Stores a single integer, that gets incremented on each call to Run(). Useful for tracking the iteration count during SGD, for example. )DOC") | |
SetDoc (R"DOC( Similar to Iter, but takes a mutex as the first input to make sure that updates are carried out atomically. This can be used in e.g. Hogwild sgd algorithms. )DOC").Input(0 | |
The mutex used to do atomic increment | Input (1,"iter","The iter counter as an int64_t TensorCPU.") |
NO_GRADIENT (Iter) | |
NO_GRADIENT (AtomicIter) | |
void | IncrementIter (TensorCPU *output) |
REGISTER_CUDA_OPERATOR (Iter, IterOp< CUDAContext >) | |
REGISTER_CUDA_OPERATOR (AtomicIter, AtomicIterOp< CUDAContext >) | |
REGISTER_CPU_OPERATOR (Lars, LarsOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (LearningRate, LearningRateOp< float, CPUContext >) | |
REGISTER_CUDA_OPERATOR (LearningRate, LearningRateOp< float, CUDAContext >) | |
REGISTER_CPU_OPERATOR (MomentumSGD, MomentumSGDOp< float, CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &, const vector< TensorShape > &in){vector< TensorShape > out(2);out[0]=in[0];out[1]=in[1];return out;}).SetDoc(R"DOC( Computes a momentum SGD update for an input gradient and momentum parameters. Concretely | |
given | inputs (grad, m, lr) and parameters(momentum |
given adjusted_gradient m_new Output | is (grad, momentum) Note the difference to MomentumSGDUpdate |
given adjusted_gradient m_new Output which actually performs the parameter | update (and is thus faster).) DOC") |
SHOULD_NOT_DO_GRADIENT (MomentumSGD) | |
REGISTER_CPU_OPERATOR (MomentumSGDUpdate, MomentumSGDUpdateOp< float, CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &, const vector< TensorShape > &in){vector< TensorShape > out(3);out[0]=in[0];out[1]=in[1];out[2]=in[3];return out;}).SetDoc(R"DOC( Performs a momentum SGD update for an input gradient and momentum parameters. Concretely | |
given | inputs (grad, m, lr, param) and arguments(momentum |
given param | return ((1+momentum)*m_new-momentum *m, m_new, param) Output is(grad |
SHOULD_NOT_DO_GRADIENT (MomentumSGDUpdate) | |
REGISTER_CPU_OPERATOR (SparseMomentumSGDUpdate, SparseMomentumSGDUpdateOp< float, CPUContext >) | |
EnforceInplace ({{1, 1},{3, 2}}).TensorInferenceFunction([](const OperatorDef & | |
SetDoc (R"DOC( Performs a momentum SGD update analogous to MomentumSGDUpdate, but using a GradientSlice and indices into the full param and momentum tables. Both param and momentum should be in-place (corresponding inputs and outputs should be the same blobs). )DOC").Input(0 | |
GradientSlice with gradients for updated indices | Input (1,"moment","Momentum blob, same shape as param.").Input(2 |
GradientSlice with gradients for updated indices Learning rate | Input (3,"param","Full parameter blob.").Input(4 |
GradientSlice with gradients for updated indices Learning rate | Indices (in first dimension of param) where updates are performed.") .Output(0 |
GradientSlice with gradients for updated indices Learning rate Adjusted gradient | Output (1,"output_moment","Updated momentum.").Output(2 |
GradientSlice with gradients for updated indices Learning rate Adjusted gradient Updated parameter | Arg ("momentum","Momentum hyperparameter.").Arg("nesterov" |
SHOULD_NOT_DO_GRADIENT (SparseMomentumSGDUpdate) | |
template<typename Context > | |
void | momentum_sgd_update (const int N, const float *g, const float *m, float *ng, float *nm, const float *lr, const float momentum, const bool nesterov, float *param, Context *) |
template<> | |
void | rmsprop_update< CPUContext > (int N, const float *g, const float *ms, const float *mom, float *ng, float *nms, float *nmom, float decay, float momentum, float epsilon, const float *lr, CPUContext *) |
REGISTER_CPU_OPERATOR (RmsProp, RmsPropOp< float, CPUContext >) | |
SetDoc (R"DOC( Computes the RMSProp update (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). Concretely, given inputs (grad, mean_squares, mom, lr), computes: mean_squares_o = mean_squares + (1 - decay) * (square(grad) - mean_squares) mom_o = momentum * mom + lr * grad / sqrt(epsilon + mean_squares_o) grad_o = mom_o Returns (grad_o, mean_squares_o, mom_o). )DOC") | |
SHOULD_NOT_DO_GRADIENT (RmsProp) | |
template<typename Context > | |
void | rmsprop_update (int N, const float *g, const float *ms, const float *mom, float *ng, float *nms, float *nmom, float decay, float momentum, float epsilon, const float *lr, Context *context) |
REGISTER_CPU_OPERATOR (YellowFin, YellowFinOp< float, CPUContext >) | |
NumInputs (10).NumOutputs(8).AllowInplace( | |
SetDoc (R"DOC( Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs momentum SGD optimization step. lr and mu are not being shared between parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary memory for computing moving averages (see the publication). Takes arguments beta: coefficient for moving averages, curv_win_width: timeframe when average squared gradient is being stored, epsilon: for numerical purposes, nesterov and zero_debias for debias of moving average. )DOC").Input(0 | |
Parameters to be updated | Input (1,"moment","Momentum").Input(2 |
Parameters to be updated Learning rate | Input (3,"mu","Momentum coefficient").Input(4 |
Parameters to be updated Learning rate Memory for latest curvature ranges | Input (5,"g_avg","Moving average of gradient").Input(6 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient | Input (7,"scalars_memory","Memory for stateful scalars").Input(8 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed | Input (9,"iter","Iteration number").Output(0 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated | Output (1,"output_moment","Momentum").Output(2 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate | Output (3,"output_mu","Output momentum coefficient").Output(4 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate Output memory for latest curvature ranges | Output (5,"output_g_avg","Output moving average of gradient").Output(6 |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate Output memory for latest curvature ranges Output moving average of squared gradient | Output (7,"output_scalars_memory","Output memory for stateful scalars").Arg("beta" |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate Output memory for latest curvature ranges Output moving average of squared gradient Default | Arg ("curv_win_width","Default 20").Arg("epsilon" |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate Output memory for latest curvature ranges Output moving average of squared gradient Default Default | Arg ("nesterov","Default false").Arg("zero_debias" |
SHOULD_NOT_DO_GRADIENT (YellowFin) | |
void | initNNPACK () |
REGISTER_CPU_OPERATOR_WITH_ENGINE (Conv, NNPACK, NNPACKConvOp) | |
REGISTER_CPU_OPERATOR (QuantDecompZstd, QuantDecompZstdOp) | |
INT_MAX | SetDoc (R"DOC( Decompress a set of tensors that are compressed using zstd. The data can be compressed using mutils.compress_data_list(), see quant_decomp_op_test.py for an example. The number of outputs depends on the input. )DOC").Input(0 |
INT_MAX Compressed data in | tensor (uint8_t) |
SHOULD_NOT_DO_GRADIENT (QuantDecompZstd) | |
bool | are_nodes_common (const Graph &g, int model_idx, int candidate_idx) |
REGISTER_TRANSFORM (CommonSubexpressionElimination, CommonSubexpressionEliminationTransform) | |
REGISTER_TRANSFORM (ConvToNNPack, ConvToNNPackTransform) | |
bool | compare_ops (const OperatorDef &p_op, const OperatorDef &g_op, bool arg_match) |
const CpuId & | GetCpuId () |
template<class Map , typename Key = typename Map::key_type, typename Value = typename Map::mapped_type> | |
Map::mapped_type | get_default (const Map &map, const Key &key, Value &&dflt) |
template<typename T , typename T2 > | |
__device__ T | mixed_mult (T data1, T2 data2) |
template<typename T , typename T2 > | |
__device__ T | mixed_add (T data1, T2 data2) |
template<typename TIN , typename TOUT > | |
__device__ void | mixed_store (TIN *data_in, TOUT *data_out) |
template<typename T > | |
__device__ void | mixed_store (T *data_in, T *data_out) |
void | MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out) |
void | MurmurHash3_x86_128 (const void *key, const int len, uint32_t seed, void *out) |
void | MurmurHash3_x64_128 (const void *key, const int len, const uint32_t seed, void *out) |
const ::std::string & | GetEmptyStringAlreadyInited () |
void | ShutdownProtobufLibrary () |
std::string | DeviceTypeName (const int32_t &d) |
bool | IsSameDevice (const DeviceOption &lhs, const DeviceOption &rhs) |
bool | ReadStringFromFile (const char *filename, string *str) |
bool | WriteStringToFile (const string &str, const char *filename) |
string | ProtoDebugString (const Message &proto) |
bool | ParseProtoFromLargeString (const string &str, Message *proto) |
bool | ReadProtoFromTextFile (const char *filename, Message *proto) |
void | WriteProtoToTextFile (const Message &proto, const char *filename) |
bool | ReadProtoFromBinaryFile (const char *filename, MessageLite *proto) |
void | WriteProtoToBinaryFile (const MessageLite &proto, const char *filename) |
bool | operator== (const NetDef &l, const NetDef &r) |
std::ostream & | operator<< (std::ostream &output, const NetDef &n) |
template<> | |
Argument | MakeArgument (const string &name, const MessageLite &value) |
bool | HasOutput (const OperatorDef &op, const std::string &output) |
bool | HasInput (const OperatorDef &op, const std::string &input) |
const Argument & | GetArgument (const OperatorDef &def, const string &name) |
bool | GetFlagArgument (const OperatorDef &def, const string &name, bool def_value) |
Argument * | GetMutableArgument (const string &name, const bool create_if_missing, OperatorDef *def) |
bool | ReadProtoFromBinaryFile (const string filename, MessageLite *proto) |
void | WriteProtoToBinaryFile (const MessageLite &proto, const string &filename) |
bool | ReadProtoFromTextFile (const string filename, Message *proto) |
void | WriteProtoToTextFile (const Message &proto, const string &filename) |
bool | ReadProtoFromFile (const char *filename, Message *proto) |
bool | ReadProtoFromFile (const string &filename, Message *proto) |
template<class IterableInputs = std::initializer_list<string>, class IterableOutputs = std::initializer_list<string>, class IterableArgs = std::initializer_list<Argument>> | |
OperatorDef | CreateOperatorDef (const string &type, const string &name, const IterableInputs &inputs, const IterableOutputs &outputs, const IterableArgs &args, const DeviceOption &device_option=DeviceOption(), const string &engine="") |
template<class IterableInputs = std::initializer_list<string>, class IterableOutputs = std::initializer_list<string>> | |
OperatorDef | CreateOperatorDef (const string &type, const string &name, const IterableInputs &inputs, const IterableOutputs &outputs, const DeviceOption &device_option=DeviceOption(), const string &engine="") |
template<typename T > | |
Argument | MakeArgument (const string &name, const T &value) |
template<typename T > | |
void | AddArgument (const string &name, const T &value, OperatorDef *def) |
bool | operator== (const DeviceOption &dl, const DeviceOption &dr) |
std::vector< std::string > | split (char separator, const std::string &string) |
size_t | editDistance (const std::string &s1, const std::string &s2, size_t max_distance) |
int32_t | editDistanceHelper (const char *s1, size_t s1_len, const char *s2, size_t s2_len, std::vector< size_t > ¤t, std::vector< size_t > &previous, std::vector< size_t > &previous1, size_t max_distance) |
int | Do256NOPs () |
template<typename T > | |
T | WaitForVariableChange (std::atomic< T > *var, T initial_value, std::condition_variable *cond, std::mutex *mutex) |
void | OpticalFlowExtractor (const cv::Mat &prev_gray, const cv::Mat &curr_gray, const int flow_alg_type, cv::Mat &flow) |
void | MergeOpticalFlow (cv::Mat &prev_flow, const cv::Mat &curr_flow) |
void | MultiFrameOpticalFlowExtractor (const std::vector< cv::Mat > &grays, const int optical_flow_alg_type, cv::Mat &flow) |
REGISTER_CPU_OPERATOR (VideoInput, VideoInputOp< CPUContext >) | |
TensorInferenceFunction ([](const OperatorDef &def, const vector< TensorShape > &){ArgumentHelper helper(def);int batch_size=helper.GetSingleArgument< int >("batch_size", 0);int clip_per_video=helper.GetSingleArgument< int >("clip_per_video", 1);int crop_height=helper.GetSingleArgument< int >("crop_height", helper.GetSingleArgument< int >("crop_size", 0));int crop_width=helper.GetSingleArgument< int >("crop_width", helper.GetSingleArgument< int >("crop_size", 0));int length_rgb=helper.GetSingleArgument< int >("length_rgb", 0);int channels_rgb=helper.GetSingleArgument< int >("channels_rgb", 3);int length_of=helper.GetSingleArgument< int >("length_of", 0);int channels_of=helper.GetSingleArgument< int >("channels_of", 2);bool get_rgb=helper.GetSingleArgument< bool >("get_rgb", true);bool get_optical_flow=helper.GetSingleArgument< bool >("get_optical_flow", false);bool do_multi_label=helper.GetSingleArgument< bool >("do_multi_label", false);bool get_video_id=helper.GetSingleArgument< bool >("get_video_id", false);int output_size=1;if(get_rgb){output_size++;}if(get_optical_flow){output_size++;}if(get_video_id){output_size++;}int index=0;vector< TensorShape > out(output_size);CHECK_GT(crop_height, 0);CHECK_GT(crop_width, 0);batch_size *=clip_per_video;if(get_rgb){out[index++]=CreateTensorShape(vector< int >{batch_size, channels_rgb, length_rgb, crop_height, crop_width}, TensorProto::FLOAT);}if(get_optical_flow){out[index++]=CreateTensorShape(vector< int >{batch_size, channels_of, length_of, crop_height, crop_width}, TensorProto::FLOAT);}if(!do_multi_label){out[index++]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);}else{int num_of_class=helper.GetSingleArgument< int >("num_of_class", 0);out[index++]=CreateTensorShape(vector< int >{batch_size, num_of_class}, TensorProto::INT32);}if(get_video_id){out[index]=CreateTensorShape(vector< int >{1, batch_size}, TensorProto::INT32);}return out;}) | |
NO_GRADIENT (VideoInput) | |
REGISTER_CUDA_OPERATOR (VideoInput, VideoInputOp< CUDAContext >) | |
void | Saturation (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen) |
void | Brightness (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen) |
void | Contrast (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_rand, std::mt19937 *randgen) |
void | ColorJitter (float *clip, const int length, const int crop_height, const int crop_width, const float saturation, const float brightness, const float contrast, std::mt19937 *randgen) |
void | ColorLighting (float *clip, const int length, const int crop_height, const int crop_width, const float alpha_std, const std::vector< std::vector< float >> &eigvecs, const std::vector< float > &eigvals, std::mt19937 *randgen) |
void | ColorNormalization (float *clip, const int length, const int crop_height, const int crop_width, const int channels, const std::vector< float > &mean, const std::vector< float > &inv_std) |
void | ClipTransformRGB (const unsigned char *buffer_rgb, const int multi_crop_count, const int crop_height, const int crop_width, const int length_rgb, const int channels_rgb, const int sampling_rate_rgb, const int height, const int width, const int h_off, const int w_off, const int *multi_crop_h_off, const int *multi_crop_w_off, const bool mirror_me, const bool color_jitter, const float saturation, const float brightness, const float contrast, const bool color_lighting, const float color_lighting_std, const std::vector< std::vector< float >> &color_lighting_eigvecs, const std::vector< float > &color_lighting_eigvals, const std::vector< float > &mean_rgb, const std::vector< float > &inv_std_rgb, std::mt19937 *randgen, float *transformed_clip) |
void | ClipTransformOpticalFlow (const unsigned char *buffer_rgb, const int crop_height, const int crop_width, const int length_of, const int channels_of, const int sampling_rate_of, const int height, const int width, const cv::Rect &rect, const int channels_rgb, const bool mirror_me, const int flow_alg_type, const int flow_data_type, const int frame_gap_of, const bool do_flow_aggregation, const std::vector< float > &mean_of, const std::vector< float > &inv_std_of, float *transformed_clip) |
void | FreeDecodedData (std::vector< std::unique_ptr< DecodedFrame >> &sampledFrames) |
bool | DecodeMultipleClipsFromVideo (const char *video_buffer, const std::string &video_filename, const int encoded_size, const Params ¶ms, const int start_frm, const int clip_per_video, const bool use_local_file, int &height, int &width, std::vector< unsigned char * > &buffer_rgb) |
REGISTER_CPU_OPERATOR (AffineChannel, AffineChannelOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (AffineChannelGradient, AffineChannelGradientOp< float, CPUContext >) | |
SetDoc (R"DOC( Applies a separate affine transformation to each channel of the input. Useful for replacing spatial batch norm with its equivalent fixed transformation. )DOC").Input(0 | |
feature map input of input of | shape (C) |
the c th element is the scale factor of the affine transformation for the c th channel of the input | Input (2,"bias","1D input of shape (C); the c-th element is the bias of the affine ""transformation for the c-th channel of the input.").Output(0 |
Input (0,"scale","See AffineChannel.").Input(1 | |
REGISTER_GRADIENT (AffineChannel, GetAffineChannelGradient) | |
REGISTER_CPU_OPERATOR (BatchPermutation, BatchPermutationOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (BatchPermutationGradient, BatchPermutationGradientOp< float, CPUContext >) | |
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient computation is only correct if indices is a permutation DOC | Input (0,"X","Tensor of at least 1D shape (N, D0, D1, ...).").Input(1 |
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient computation is only correct if indices is a permutation DOC tensor of type int with | shape (N,) specifying a valid permutation" "of the indices in[0 |
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient computation is only correct if indices is a permutation DOC tensor of type int with | N (inclusive).") .Output( 0 |
NumInputs(2).NumOutputs(1).SetDoc(R"DOC( Permute the batch elements of the input tensor X according to the permutation specified in the input indices. Warning gradient computation is only correct if indices is a permutation DOC tensor of type int with Tensor with the same shape as X where | the (D0, D1,...) dimensional" "batch elements of X are permuted according to the input indices.") |
See BatchPermutation | Input (1,"dY","Gradient of forward output 0 (Y).").Output(0 |
REGISTER_GRADIENT (BatchPermutation, GetBatchPermutationGradient) | |
REGISTER_CPU_OPERATOR (GroupSpatialSoftmax, GroupSpatialSoftmaxOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (GroupSpatialSoftmaxGradient, GroupSpatialSoftmaxGradientOp< float, CPUContext >) | |
number of classes in each softmax group | Input (0,"scores","4D tensor of softmax inputs (called 'scores' or 'logits') with shape ""(N, C, H, W), where C = num_anchors * num_classes defines num_anchors ""groups of contiguous num_classes softmax inputs.").Output(0 |
See GroupSpatialSoftmax | Input (1,"d_probabilities","Gradient of forward output 0 (probabilities).").Output(0 |
See GroupSpatialSoftmax Gradient of forward | input (scores).") |
REGISTER_GRADIENT (GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient) | |
REGISTER_CPU_OPERATOR (PSRoIPool, PSRoIPoolOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (PSRoIPoolGradient, PSRoIPoolGradientOp< float, CPUContext >) | |
Spatial scale of the input feature map X relative to the input image E if X has a stride of w r t the input image | Arg ("group_size","(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} ""is the pooled output Y's height and width, respectively.").Arg("output_dim" |
number of channels in the pooled which might be the number of classes is used for classification or if used for class agnostic bounding box regression | Input (0,"X","4D position sensitive feature map input of shape (N, C, H, W), where ""C = group_size**2 * output_dim.").Input(1 |
REGISTER_CPU_OPERATOR (RoIPoolF, RoIPoolFOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (RoIPoolFGradient, RoIPoolFGradientOp< float, CPUContext >) | |
Pooled output Y s width | Input (0,"X","4D feature map input of shape (N, C, H, W).").Input(1 |
REGISTER_CPU_OPERATOR (SampleAs, SampleAsOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SampleAsGradient, SampleAsGradientOp< float, CPUContext >) | |
Tensor of at least | shape (N,...).") .Input( 1 |
See SampleAs | Input (1,"labels","See SampleAs.").Input(2 |
REGISTER_GRADIENT (SampleAs, GetSampleAsGradient) | |
REGISTER_CPU_OPERATOR (SelectSmoothL1Loss, SelectSmoothL1LossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SelectSmoothL1LossGradient, SelectSmoothL1LossGradientOp< float, CPUContext >) | |
L2 to L1 transition point | Arg ("scale","(float) default 1.0; multiply the loss by this scale factor.").Input(0 |
L2 to L1 transition point tensor of bounding box regression predictions with | shape (N, 4 *num_bbox_classes *num_anchors, H, W).") .Input( 1 |
L2 to L1 transition point tensor of bounding box regression predictions with tensor of labels | shape (M, 4) for 4 contiguous channels starting" "at each of the M locations selected by the locations input.") .Input( 2 |
L2 to L1 transition point tensor of bounding box regression predictions with tensor of labels tensor of shape(M, 4) that identifies M 'select' locations" "encoded by the four columns the loss is divided by | max (1, normalizer).") .Output( 0 |
See SelectSmoothL1Loss | Input (1,"Y","See SelectSmoothL1Loss.").Input(2 |
See SelectSmoothL1Loss See SelectSmoothL1Loss | Input (3,"normalizer","See SelectSmoothL1Loss.").Input(4 |
See SelectSmoothL1Loss See SelectSmoothL1Loss Gradient of forward | output (loss).") .Output( 0 |
See SelectSmoothL1Loss See SelectSmoothL1Loss Gradient of forward Gradient of forward | input (Y_hat).") |
REGISTER_GRADIENT (SelectSmoothL1Loss, GetSelectSmoothL1LossGradient) | |
REGISTER_CPU_OPERATOR (SigmoidCrossEntropyLoss, SigmoidCrossEntropyLossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SigmoidCrossEntropyLossGradient, SigmoidCrossEntropyLossGradientOp< float, CPUContext >) | |
multiply the loss by this scale factor | Arg ("normalize","(int) default 1; if true, divide the loss by the number of targets > ""-1.").Input(0 |
multiply the loss by this scale factor Tensor of predicted | logits (shape must be at least 1D).") .Input( 1 |
multiply the loss by this scale factor Tensor of predicted Tensor of targets of type int and same shape as logits X | Output (0,"loss","Scalar loss.") |
See SigmoidCrossEntropyLoss | Input (1,"targets","See SigmoidCrossEntropyLoss.").Input(2 |
REGISTER_GRADIENT (SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient) | |
REGISTER_CPU_OPERATOR (SigmoidFocalLoss, SigmoidFocalLossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SigmoidFocalLossGradient, SigmoidFocalLossGradientOp< float, CPUContext >) | |
where N is the number of elements in the H and W are the height and and each of length num_classes For the binary form of Focal num_classes does not include the background category(So, for COCO, num_classes=80, not 81.) The binary form of focal loss is multiply the loss by this scale factor | Arg ("alpha","(float) default 0.25; Focal Loss's alpha hyper-parameter.").Arg("gamma" |
Focal Loss s gamma hyper parameter | Arg ("num_classes","(int) default 80; number of classes (excluding background).").Input(0 |
Focal Loss s gamma hyper parameter tensor of sigmoid | inputs (called 'scores'or 'logits') with shape" "(N |
See SigmoidFocalLoss | Input (1,"labels","See SigmoidFocalLoss.").Input(2 |
See SigmoidFocalLoss See SigmoidFocalLoss | Input (3,"d_loss","Gradient of forward output 0 (loss)").Output(0 |
See SigmoidFocalLoss See SigmoidFocalLoss Gradient of forward | input (logits)") |
REGISTER_GRADIENT (SigmoidFocalLoss, GetSigmoidFocalLossGradient) | |
REGISTER_CPU_OPERATOR (SmoothL1Loss, SmoothL1LossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SmoothL1LossGradient, SmoothL1LossGradientOp< float, CPUContext >) | |
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of | predictions (at least 1D).") .Input( 1 |
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of Tensor of labels with the same shape as Y_hat | Input (2,"alpha_in","Tensor of inside weights with the same shape as Y.").Input(3 |
See SmoothL1Loss | Input (1,"Y","See SmoothL1Loss.").Input(2 |
See SmoothL1Loss See SmoothL1Loss | Input (3,"alpha_out","See SmoothL1Loss.").Input(4 |
REGISTER_GRADIENT (SmoothL1Loss, GetSmoothL1LossGradient) | |
REGISTER_CPU_OPERATOR (SoftmaxFocalLoss, SoftmaxFocalLossOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (SoftmaxFocalLossGradient, SoftmaxFocalLossGradientOp< float, CPUContext >) | |
where N is the number of elements in the H and W are the height and and where t is the | target (ground truth) class |
Focal Loss s gamma hyper parameter | Arg ("num_classes","(int) default 81; number of classes in each softmax group.").Input(0 |
the loss is normalized by Scalar loss | Output (1,"probabilities","4D tensor of softmax probabilities with shape (N, C, H, W), where ""C = num_anchors * num_classes, and softmax was applied to each of the ""num_anchors groups; within a group the num_classes values sum to 1.") |
See SoftmaxFocalLoss | Input (1,"labels","See SoftmaxFocalLoss.").Input(2 |
See SoftmaxFocalLoss See SoftmaxFocalLoss | Input (3,"probabilities","Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.").Input(4 |
REGISTER_GRADIENT (SoftmaxFocalLoss, GetSoftmaxFocalLossGradient) | |
REGISTER_CPU_OPERATOR (SpatialNarrowAs, SpatialNarrowAsOp< CPUContext >) | |
REGISTER_CPU_OPERATOR (SpatialNarrowAsGradient, SpatialNarrowAsGradientOp< CPUContext >) | |
or input of | shape (N, H0, W0) or(N |
or input of W0 | Input (1,"B","3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 ""and W1 <= W0.").Output(0 |
or input of W0 Sub window of A containing | rows (inclusive) and columns" "[0 |
or input of W0 Sub window of A containing | W1 (inclusive).") |
See SpatialNarrowAs | Input (1,"B","See SpatialNarrowAs.").Input(2 |
See SpatialNarrowAs Gradient of forward | output (C).") .Output( 0 |
See SpatialNarrowAs Gradient of forward Gradient of forward | input (A)") |
REGISTER_GRADIENT (SpatialNarrowAs, SpatialNarrowAsGradient) | |
REGISTER_CPU_OPERATOR (UpsampleNearest, UpsampleNearestOp< float, CPUContext >) | |
REGISTER_CPU_OPERATOR (UpsampleNearestGradient, UpsampleNearestGradientOp< float, CPUContext >) | |
integer upsampling factor feature map of | shape (N, C, scale *H, scale *W) |
REGISTER_GRADIENT (UpsampleNearest, GetUpsampleNearestGradient) | |
REGISTER_CPU_OPERATOR (Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp) | |
OPERATOR_SCHEMA (Caffe2ModuleTestDynamicDummy) | |
REGISTER_CAFFE2_EARLY_INIT_FUNCTION (registerGlobalPerfNetObserverReporter,®isterGlobalPerfNetObserverReporter,"Caffe2 print net observer reporter") | |
REGISTER_CAFFE2_EARLY_INIT_FUNCTION (registerGlobalPerfNetObserverCreator,®isterGlobalPerfNetObserverCreator,"Caffe2 net global observer creator") | |
CAFFE2_MODULE (caffe2_rocksdb,"RocksDB implementation for caffe2::DB.") | |
Variables | |
DoRunWithOtherType2 typedef Registry< std::string, std::unique_ptr< OperatorBase >, const OperatorDef &, Workspace * > | OperatorRegistry |
const int | kCIFARSize = 32 |
const int | kCIFARImageNBytes = kCIFARSize * kCIFARSize * 3 |
const int | kCIFAR10BatchSize = 10000 |
const int | kCIFAR10TestDataSize = 10000 |
const int | kCIFAR10TrainBatches = 5 |
const int | kCIFAR100TrainDataSize = 50000 |
const int | kCIFAR100TestDataSize = 10000 |
constexpr size_t | gCaffe2Alignment = 32 |
constexpr auto | kTensorBlobType = "Tensor" |
constexpr auto | kChunkIdSeparator = "#%" |
constexpr int | kDefaultChunkSize = -1 |
constexpr int | kNoChunking = 0 |
std::atomic< bool > | g_caffe2_has_cuda_linked {false} |
constexpr int | CAFFE_CUDA_NUM_THREADS = 512 |
constexpr int | CAFFE_MAXIMUM_NUM_BLOCKS = 4096 |
constexpr int | MaxDeviceTypes = DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES |
std::function< void(const OperatorDef &)> | GetOperatorLogger () |
DoRunWithType2 | |
constexpr int | kCannotComputeNumOutputs = -1 |
constexpr auto | kQTensorBlobQType = "QTensor" |
constexpr int | k_limit_default_ = 1000 |
float16 | |
constexpr auto | kBlobName = "blob_name" |
constexpr auto | kAddValue = "add_value" |
alternative key for the | handler |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to | Type |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to Scale the size of the smallest dimension of the image to this Scale and minsize are mutually exclusive Must be larger than crop | If |
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to Scale the size of the smallest dimension of the image to this Scale and minsize are mutually exclusive Must be larger than crop both dimensions of the image will be set to minsize or | scale |
otherwise | |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and shortest side desired for image resize Defaults to[-1,-1] or no random resize desired | data |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and shortest side desired for image resize Defaults to[-1,-1] or no random resize desired Tensor containing the images additional | outputs |
the other dimension is proportionally scaled Defaults to Whether or not to mirror the image Defaults to Vector of means per color Standard deviation by which to normalize color channels Defaults to Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults Bounding box coordinate Defaults if the input is in Caffe format Defaults to Number of CPU decode transform threads Defaults to Name of the Type of The sizes of any outputs besides the data and shortest side desired for image resize Defaults to[-1,-1] or no random resize desired Tensor containing the images additional Any outputs after the first will be Tensors read from the input | TensorProtos |
const char *const | snpe_ffi_so = "libsnpe_ffi.so" |
constexpr size_t | k2b1bXBits = 2 |
constexpr size_t | kL1CacheSizeBytes = 16 * 1024 |
constexpr size_t | kGEMMTileSize = 64 |
constexpr size_t | kGEMMTileDepthBytes = 16 |
element wise DOC | output |
we add to | it |
we first initialize the output tensor to all | zeros |
we first initialize the output tensor to all and then do accumulation Any further calls to the | input |
we first initialize the output tensor to all and then do accumulation Any further calls to the The input tensor that has to be accumulated to the output tensor If the output size is not the same as input | size = 1 |
we first initialize the output tensor to all and then do accumulation Any further calls to the The input tensor that has to be accumulated to the output tensor If the output size is not the same as input the output tensor is first reshaped and initialized to | zero |
we first initialize the output tensor to all and then do accumulation Any further calls to the The input tensor that has to be accumulated to the output tensor If the output size is not the same as input the output tensor is first reshaped and initialized to and only | then |
this | X = in[0] |
a_1 | |
ints | |
longs | |
or long longs and checks if all values are true when coerced into a boolean In other | words |
const vector< TensorShape > & | in |
ArgumentHelper | helper (def) |
vector< int > | output_dims = GetDimsVector(in[1]) |
const auto & | data_dims = GetDimsVector(in[0]) |
const auto & | indices_dims = GetDimsVector(in[1]) |
out [0] = CreateTensorShape(output_dims, TensorProto::FLOAT) | |
DATA | |
Tensor of rank | r |
indices | vector |
indices and values vector Each element in lengths indices should not have duplicate number For | example |
The | |
original data tensor | masked_data |
return the segment lengths of a corresponding segmented tensor after BooleanMask is applied DOC | mask |
const float | minf = -1.0f * std::numeric_limits<float>::infinity() |
reconstruct values together according to masks A comprehensive | False |
reconstruct values together according to masks A comprehensive | True |
reconstruct values together according to masks A comprehensive False | values1 = 1.0 |
reconstruct values together according to masks A comprehensive False | mask2 = False |
reconstruct values together according to masks A comprehensive False False | values2 |
reconstruct values together according to masks A comprehensive False False True | values3 = 4.0 |
reconstruct values together according to masks A comprehensive False False True Reconstruct | by |
reconstruct values together according to masks A comprehensive False False True Reconstruct | mask3 |
reconstruct values together according to masks A comprehensive False False True Reconstruct We | get |
reconstruct values together according to masks A comprehensive False False True Reconstruct We Note that for all mask | positions |
reconstruct values together according to masks A comprehensive False False True Reconstruct We Note that for all mask there must be at least one True If for a field there are multiple True | s |
The data type to which the elements of the input tensor are cast Strictly must be one of the types from DataType enum in TensorProto Output tensor with the same shape as input with type specified by the to | argument |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation | Typically |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC | mean |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC The mean saved from the forward pass as a dimensional tensor of size C | output_grad |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC The mean saved from the forward pass as a dimensional tensor of size C Gradient for the output layer of | SpatialBN |
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC The mean saved from the forward pass as a dimensional tensor of size C Gradient for the output layer of here used as input because we are on the backward pass | bias_grad |
computes the sum of all elements per channel and the sum of all elements squared per channel These values can be reduced across multiple batches and used to obtain the mean and variance across the full set of batches Using the new mean and variance as input to SpatialBN has the effect of changing the batch size over which SpatialBN is applied DOC | sum |
kv_handler | |
Key value handler for | comm_world |
Key value handler for A common world for collective operations int rank of this node in the common | world |
existing_comm_world | |
common_world | |
The common world to be | destroyed |
The common world | Y |
The common world The reduced result on | root |
The common world The allreduced | tensor |
The common world The allreduced same on all | nodes |
The common world | dst |
The common world An int CPUtensor of size specifying the rank If | given |
The common world An int CPUtensor of size specifying the rank If this overrides the to argument of the op The rank to send the tensor to bool if | set |
The common world An int CPUtensor of size specifying the rank If this overrides the to argument of the op The rank to send the tensor to bool if only send the content and assume that the receiver has already known the tensor s shape and | information |
The common world | src |
The common world An int CPUtensor of size specifying the rank If this overrides the from argument of the op The received tensor | tag |
INT_MAX | split |
INT_MAX Optional list of output Which axis to split on Either NHWC or | NCWH |
INT_MAX Optional list of output Which axis to split on Either NHWC or will split on C | axis |
Which axis to concat on Pass to add the axis specified in arg axis to all input tensors | concat_result |
apply conditional | DataT |
apply conditional Data to use when True | DataO |
apply conditional Data to use when True Output data after applying | ConditionalOp |
const char * | kConvDoc |
the filter | blob |
the filter and the bias and computes the output Note that other | parameters |
the filter and the bias and computes the output Note that other such as the stride and kernel or the pads sizes in each direction are not necessary for input because they are provided by the ConvTransposeUnpoolOpBase | operator.Various dimension checks are done implicitly, and the sizes are specified in the Input docs for this operator.As is expected, the filter is deconvolved with a subset of the image and the bias is added |
this is done throughout the image data and the output is computed As a side note on the implementation | layout |
this is done throughout the image data and the output is computed As a side note on the implementation which is why they are separate files DOC | filter |
this is done throughout the image data and the output is computed As a side note on the implementation which is why they are separate files DOC The filter blob that will be used in the transposed | convolution |
has where C is the number of | channels |
element wise DOC The cosine of the input tensor computed element | wise |
counter | |
A blob pointing to an instance of a new counter Resets counter to this | value |
Input blob from the previous | layer |
Input blob from the previous which is almost always the result of a softmax | operation |
X is a array of size N x | D |
X is a array of size N x where N is the batch size and D is the number of classes Output blob after the cross entropy | computation |
logits = in[0] | |
matrix of logits for each example and class | xentropy |
matrix of logits for each example and class | weights |
or input tensor | Z |
Y with different shapes and produces one output float tensor of the dot product between X and Y We currently support two kinds of strategies to achieve this Before doing normal dot_product pad the smaller Y must be equal Only the second dimension of X or Y can be padded DOC or input tensor whether to replicate the smaller tensor or | not |
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace | blobs |
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in | order |
INT_MAX Subnet with blob bindings Indices of corresponding outer workspace in List of blobs from the forward Do int out | bool { return true |
ArgumentHelper | argsHelper (def) |
auto | output_mask = !argsHelper.GetSingleArgument<bool>("is_test", 0) |
float | |
default the ratio of random dropout The input data as Tensor The output mask If is_test is | nonzero |
describes the axis of the | inputs |
defaults to one because the axis most likely describes the | batch_size |
const char * | kBroadcastDoc |
and the dimensions of the second input is the contiguous subset of the dimensions of the first For the following tensor shapes are | supported |
element wise This operation can be done in an in place fashion | too |
the data type of value is used The output tensor shape is specified by the shape argument If the number of input | is |
the data type of value is used The output tensor shape is specified by the shape argument If the number of input the shape will be identical to that of the input at run time with optional additional dimensions appended at the end as specified by extra_shape argument In that case the shape argument should not be set If input_as_shape is set to | true |
index = 0 | |
query | |
TIndex | outer = 1 |
TIndex | inner = 1 |
A tensor of | rank |
and then scaling each element to an bit number between and To later de quantize | values |
followed by the bias as a bit float in the next | bytes |
followed by the bias as a bit float in the next and the quantized values in the preceding bytes of the row The output is a matrix containing only the but de quantized De quantization is performed by multiplying each value by its row s scale and bias parameters The de quantized values will thus not be exactly equal to the | original |
followed by the bias as a bit float in the next and the quantized values in the preceding bytes of the row The output is a matrix containing only the but de quantized De quantization is performed by multiplying each value by its row s scale and bias parameters The de quantized values will thus not be exactly equal to the un quantized floating point values DOC | float_input |
but operating on bit rowwise quantized matrices with fused uint8 tensor with rank obtained with | OUTPUT |
in a sequence length aware fashion | Concretely |
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU | activations |
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is Bool to determine if hidden state is zeroes or passed along for timesteps past the given sequence_length | hidden |
in a sequence length aware fashion given the previous hidden and the sequence computes the GRU avoiding computation if the input is Bool to determine if hidden state is zeroes or passed along for timesteps past the given sequence_length The new GRU hidden state calculated by this | op |
When | false |
When the sequence lengths input is left and all following inputs are shifted left by | one |
INT_MAX Net executed when condition is true | condition |
auto | pad = helper.GetSingleArgument<int>("pad", 0) |
auto | kernel_h |
auto | kernel_w |
auto | dilation_h |
auto | dilation_w |
auto | stride_h |
auto | stride_w |
int | N = 0 |
int | C = 0 |
int | H = 0 |
int | W = 0 |
const int | dkernel_h = dilation_h * (kernel_h - 1) + 1 |
const int | dkernel_w = dilation_w * (kernel_w - 1) + 1 |
const int | out_h = (H + 2 * pad - dkernel_h) / stride_h + 1 |
const int | out_w = (W + 2 * pad - dkernel_w) / stride_w + 1 |
Max number of | elements |
return an Int tensor of same shape containing the indices for each of the keys If the index is | frozen |
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index | Otherwise |
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index new entries are added into the index If an insert is necessary but max_elements has been | reached |
return an Int tensor of same shape containing the indices for each of the keys If the index is unknown entries are given index new entries are added into the index If an insert is necessary but max_elements has been fail DOC | keys |
disallowing creation of new index entries Should not be called concurrently with IndexGet DOC | handle |
Pointer to an Index instance The input handle If skips the first entry of the tensor This allows to load tensors that are aligned with an | embedding |
Pointer to an Index instance The input handle If skips the first entry of the tensor This allows to load tensors that are aligned with an where the first entry corresponds to the default index | entry |
the first element of the output tensor will be element of index DOC | items |
array of probabilities for prediction | L |
array of probabilities for prediction array of JSD | losses |
INT_MAX | |
auto | input_dims_long = GetDimsVector(in[0]) |
const auto | canonical_axis |
int default | to |
Coefficient of | leakage |
but operating on bit rowwise quantized matrices with fused uint8 tensor obtained with | LENGTHS |
where | |
for each | row |
for each weights are accessed by | indices [0..L-1] |
for each weights are accessed by where L is the length of given row This is basically a fused | WEIGHT |
m_2 | |
m_n | |
reshape it into matrix of we compute | scale_i |
reshape it into matrix of we compute where min_i and max_i minimum and maximum elements of i th and quantize each element | r_ |
reshape it into matrix of we compute where min_i and max_i minimum and maximum elements of i th and quantize each element each row r_i of which stores a pair | s_i |
reshape it into matrix of we compute where min_i and max_i minimum and maximum elements of i th and quantize each element each row r_i of which stores a pair | b_i |
and LENGTHS tensor of duplicate each entry of the outer most dimension of DATA according to and concatenate them in an output tensor of rank r | Example |
where segments are defined by their and concatenate them in an output tensor of | shape =(SIZE(LENGTHs), k). In case there's less than k values in a segment |
where segments are defined by their and concatenate them in an output tensor of the output value will be padded and the corresponding output indices will be padded by DOC Tensor of int32 lengths of rank | TopKIndices |
exists | |
INT_MAX | int |
INT_MAX default if use the db path directly and do not prepend the current root folder of the workspace | string |
INT_MAX default if use the db path directly and do not prepend the current root folder of the workspace | default |
but allows one to save to db every few | iterations |
stride | sizes |
stride and pad lengths defined by the ConvPoolOpBase Output data tensor from L p pooling across the input tensor Dimensions will vary based on various | kernel |
stride and pad lengths defined by the ConvPoolOpBase Output data tensor from L p pooling across the input tensor Dimensions will vary based on various | stride |
and label is applied to the tensor elementwise If | y |
ArgumentHelper | arg_helper (def) |
int | axis_a = arg_helper.GetSingleArgument<int>("axis_a", 1) |
int | axis_b = arg_helper.GetSingleArgument<int>("axis_b", 1) |
int | trans_a = arg_helper.GetSingleArgument<bool>("trans_a", false) |
int | trans_b = arg_helper.GetSingleArgument<bool>("trans_b", false) |
int | canonical_axis_a = canonical_axis_index_(axis_a, in[0].dims().size()) |
int | canonical_axis_b = canonical_axis_index_(axis_b, in[0].dims().size()) |
int | M = size_to_dim_(canonical_axis_a, GetDimsVector(in[0])) |
A | |
matrix of | B |
data_0 | |
prediction | |
D float i | e |
D float i batch size D is number of possible classes labels | accuracies |
axis to | normalize |
lengths | |
d int long tensor contains the length in each of the output | packed_tensor |
d int long tensor contains the length in each of the output N dim Tensor where | presence_mask |
d int long tensor contains the length in each of the output N dim Tensor where dim boolean false where packed_tensor is | padded |
d int long tensor contains the length in each of the input N dim | Tensor |
CPUContext::PadTensorInference Input data tensor from the previous | operator |
dimensions depend on whether the NCHW or NHWC operators are being used For in the | former |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In | particular |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this | value_to_pct |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this Sorted with columns Each element in the first column is a float representing the raw value of a sample Its corresponding element in the next column represents the percentile it maps to | percentile_values |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this Sorted with columns Each element in the first column is a float representing the raw value of a sample Its corresponding element in the next column represents the percentile it maps to tensor of | floats |
given a sample set of raw labeled with their corresponding percentiles from the same distribution In this Sorted with columns Each element in the first column is a float representing the raw value of a sample Its corresponding element in the next column represents the percentile it maps to tensor of with the same dimensions as the flattened input tensor Each element of this corresponds to the percentile calculated for | original_values [i] |
probabilities | |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x | prediction_dimensions |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each | piece |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is | slopes |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is intercepts can be passed either through args or through input blobs If we have multiple groups of piecewise linear | functions |
a D or D slopes and intercepts The output tensor has the same shape of input predictions and contains the predictions transformed by the piecewise linear functions Each column of predictions has its own piecewise linear transformation functions Therefore the size of piecewise function parameters are pieces x except for binary predictions where only the positive prediction needs them Note that in each low bound is excluded while high bound is included Also the piecewise linear function must be continuous Notes If the input is binary set the binary arg to true so that one group of piecewise linear functions is intercepts can be passed either through args or through input blobs If we have multiple groups of piecewise linear each group has the same number of pieces If a prediction is out of the | bounds |
const char * | kAveragePoolDoc |
const char * | kMaxPoolDoc |
and produces one output is applied to the data tensor elementwise DOC | Slope |
Size of the dimension to prepend | reshaped |
Number of dimensions to reduce | data_in |
the value to replace | NaN |
auto | actualNewShape = helper.GetRepeatedArgument<int64_t>("shape") |
int64_t | totalSize = 1 |
int | unknownIdx = -1 |
else | |
an extra argument shape must be specified It outputs the reshaped tensor as well as the original shape At most one dimension of the new shape can be In this | case |
an extra argument shape must be specified It outputs the reshaped tensor as well as the original shape At most one dimension of the new shape can be In this the value is inferred from the size of the tensor and the remaining dimensions A dimension could also | be |
segments | |
embeddings | |
Prefix string to prepend extracted blobs | blob_names |
Prefix string to prepend extracted blobs tensor of strings containing extracted blob | names |
the implementation takes an the hidden state the cell and a weight | TxNxD |
the implementation takes an the hidden state the cell and a weight the final hidden cell | bidirectional |
the implementation takes an the hidden state the cell and a weight the final hidden cell | num_layers |
the implementation takes an the hidden state the cell and a weight the final hidden cell | rnn_mode |
the implementation takes an the hidden state the cell and a weight the final hidden cell input_mode are passed directly through to CuDNN | DOC |
R recurrent or input R | all_params |
See RoIPoolF | dY |
See RoIPoolF Gradient of forward | dX |
Spatial scale of the input feature map X relative to the input image E | g |
Pooled output Y s width feature map input of | RoIs |
there are multiple output | cases |
default the scale to | apply |
affects the activation function itself This should go with the weight initialization in the paper See | https |
Number of copies of padding to add around each range T< N, D1..., Dn > Input data | start_padding |
Number of copies of padding to add around each range T< N, D1..., Dn > Input data T< D1..., Dn > Padding data for range start | data_out |
Outer size of padding present around each range T< N, D1..., Dn > Padded input data | padding_sum |
Outer size of padding present around each range T< N, D1..., Dn > Padded input data Sum of all start | paddings |
rather | |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis | provided |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of | dimensions [a_0, a_1 *...*a_{n-1}] |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this | situation |
it will be coerced into one For an arbitrary n dimensional tensor X in[a_0, a_1,..., a_{k-1}, a_k,..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions[a_0 *...*a_{k-1}, a_k *...*a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this we must have | a_0 |
auto | labels = in[1] |
const int | num_classes |
Unscaled log probabilities | weight_tensor |
Unscaled log probabilities Optional blob to be used to weight the samples for the loss | loss |
this op outputs a copy of the input tensor where values from the height and width dimensions are moved to the batch dimension After the zero | padding |
followed by cropping This is the reverse transformation of SpaceToBatch More | specifically |
param | |
Parameters to be normalized | grad |
Parameters to be normalized Gradient computed A bool variable to control whether to use max norm or constant norm When | use_max_norm = false |
Parameters to be normalized Gradient computed A bool variable to control whether to use max norm or constant norm When constant norm is used so that all the embedding vectors are scaled to have a L2 norm equals to max norm is used so that embedding is scaled so that its l2 norm is no larger than A If an embedding s norm is less than A | originally |
Unscaled log probabilities Optional blob to be used to weight the samples for the loss With spatial weighting is by | x |
A Blob pointing to the newly created | StatRegistry |
If export values from given StatRegistry export values from the global singleton StatRegistry int64 tensor with exported values default true Whether to atomically reset the counters | afterwards |
returning a pointer to it The timer is stopped by calling TimerEnd DOC | timer |
returning a pointer to it The timer is stopped by calling TimerEnd DOC Pointer to to be passed to | TimerEnd |
stops the timer publishing a CAFFE_EVENT DOC | nanos |
stops the timer publishing a CAFFE_EVENT DOC nanoseconds in | int64 |
utilization | |
Delta in max CPU utilization | observed |
default flag to indicate if the summarized statistics have to be written to a log file D | max |
default flag to indicate if the summarized statistics have to be written to a log file D mean and standard | deviation |
constexpr char | kSummaryzeOpExtension [] = ".summary" |
auto | tiles = helper.GetSingleArgument<int32_t>("tiles", 1) |
Number of replicas The input tensor optional Axis to replicate | tiled_data |
a_2 | |
a_n | |
r and integer argument | k |
r and integer argument return two k which contains the values of the top k elements along the last dimension Index tensor of shape[a_1, a_2,..., a_n, k] which contains the indices of the top k this | Values |
vector< int > | axes = helper.GetRepeatedArgument<int>("axes") |
auto | valid_axes |
A list of integers By reverse the otherwise permute the axes according to the values given | transposed |
time | |
The time in | nanoseconds |
bool | if |
bool saves contents to the root folder of the current | workspace |
int | total = 1 |
A tensor of sharing its | storage |
weight tensor pairs Input should be in the form | X_0 |
weight tensor pairs Input should be in the form | weight_0 |
weight tensor pairs Input should be in the form | X_1 |
weight tensor pairs Input should be in the form | weight_1 |
Tensor to be updated | INDICES |
Tensor to be updated D list of indices on the first dimension of X_0 that need to be updated | Weight_1 |
Tensor to be updated | SLICES |
Tensor to be updated Update | slices |
const char | kPrintFileExtension [] = ".log" |
const ArgumentHelper | args (def) |
sampling_cdf | |
An optional D Tensor< float > Input cumulative sampling all values in sampling_cdf will be scaled by this number | sampled_indexes |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D | sampling_weights |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor< float > of | sampling_values |
an index is randomly sampled from the distribution given by the weights of the corresponding batch The output is a D A D Tensor< float > of An optional D Tensor< float > of The output tensor contains | sampled_values |
Timeout in | secs |
Timeout in | queue |
The shared pointer for the | BlobsQueue |
the output status will be set to true which can be used as exit criteria for execution step The input is the queue and the last output is the status The rest are data blobs DOC The shared pointer for the BlobsQueue | status |
the output status will be set to true which can be used as exit criteria for execution step The input is the queue and the last output is the status The rest are data blobs DOC The shared pointer for the BlobsQueue Is set to depending on the success of | dequeue |
Parameters to be updated Gradient computed | output_param |
moment | |
lr | |
runs the dense AdaGrad update and Parameters to be updated Sparse indices learning rate | output_moment_1 |
runs a modified sparse Adagrad update and where moment is a tensor with length equal to the number of rows in and the new moment is calculated by adding the average squared sum of gradients across each row Note that indices must also be a tensor indexing into the rows of param DOC Moment history Gradient computed Updated parameters | Default |
Parameters to be updated | moment_2 |
Parameters to be updated Second moment history learning rate Updated parameters | output_moment_2 |
Parameters to be updated Second moment history Gradient computed | iter |
input_tensor | |
Tensor of floats to be clipped | additional_threshold |
Tensor of floats to be clipped An optional additional threshold to scale the original threshold | clipped |
mutex | |
given | nesterov |
given | computes |
given | adjusted_gradient |
given param | momentum |
given param parameter Note the difference to | MomentumSGD |
GradientSlice with gradients for updated indices Learning rate Adjusted gradient Updated parameter boolean Whether to use Nesterov Accelerated | Gradient |
Parameters to be updated Learning rate | curv_win |
Parameters to be updated Learning rate Memory for latest curvature ranges | g2_avg |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated | output_lr |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate | output_curv_win |
Parameters to be updated Learning rate Memory for latest curvature ranges Moving average of squared gradient Gradient computed Parameters to be updated Output learning rate Output memory for latest curvature ranges | output_g2_avg |
INT_MAX | compressed |
constexpr size_t | kDefaultMinWorkSize = 80 |
constexpr size_t | kCacheLineSize = 64 |
constexpr size_t | kGEMMLOWPCacheLineSize = 64 |
const int | kMaxBusyWaitNOPs = 32 * 1000 * 1000 |
where N is the number of elements in the | batch |
where N is the number of elements in the H and W are the height and | width |
where N is the number of elements in the H and W are the height and and each of length num_classes The softmax is applied to each group independently | See |
number of classes in each softmax group tensor of softmax probabilities with where and softmax was applied to each of the num_anchors | groups |
scores | |
See GroupSpatialSoftmax | d_scores |
L2 to L1 transition point | Y_hat |
L2 to L1 transition point tensor of bounding box regression predictions with tensor of labels | locations |
See SelectSmoothL1Loss See SelectSmoothL1Loss | d_loss |
See SelectSmoothL1Loss See SelectSmoothL1Loss Gradient of forward | d_Y_hat |
where indicates that the corresponding sample should be ignored | and |
multiply the loss by this scale factor Tensor of predicted | targets |
where N is the number of elements in the H and W are the height and and each of length num_classes For the binary form of Focal | Loss |
See SigmoidFocalLoss | normalizer |
See SigmoidFocalLoss See SigmoidFocalLoss | d_logits |
NumInputs(4).NumOutputs(1).SetDoc(R"DOC( Smooth L1 Loss is a minor variation of Huber loss in which the point of transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta L2 to L1 transition point Tensor of Tensor of labels with the same shape as Y_hat | alpha_out |
See SmoothL1Loss | alpha_in |
where N is the number of elements in the H and W are the height and and where | p_i = exp(s_i) / sum_j exp(s_j) |
or input of | H0 |
See SpatialNarrowAs | dC |
See SpatialNarrowAs Gradient of forward | dA |
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules.
Copyright (c) 2016-present, Facebook, Inc.
Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved.
Copyright 2016 Facebook.
Simple registry implementation in Caffe2 that uses static variables to register object creators during program initialization time.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
using caffe2::UnaryElementwiseOp = typedef UnaryElementwiseWithArgsOp< InputTypes, Context, WithDefaultConstructor<Functor>, OutputType> |
UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the difference that it takes a functor with default constructor, e.g.
that does not need to take into consideration any arguments during operator creation.
Definition at line 93 of file elementwise_op.h.
int caffe2::CaffeCudaGetDevice | ( | ) |
Gets the current GPU id.
This is a simple wrapper around cudaGetDevice().
Definition at line 109 of file common_gpu.cc.
void caffe2::CaffeCudaSetDevice | ( | const int | id | ) |
Sets the current GPU id to the given device id.
This is a simple wrapper around cudaSetDevice().
Definition at line 122 of file common_gpu.cc.
Creates a network, accessing / creating blobs in the given workspace.
Note that this is different from Workspace::CreateNet. The latter adds the created net object to the workspace's net map, while this function returns a standalone net object.
std::unique_ptr<RecurrentNetworkExecutorBase> caffe2::createRNNExecutor< CPUContext > | ( | const NetDef & | step_net_def, |
std::map< string, string > & | recurrent_input_map, | ||
std::string | timestep_blob, | ||
ArgumentHelper | rnn_args | ||
) |
Implementation of RecurrentNetworkExecutor that uses thread pool for multithreaded execution of RNNs.
Used with CPU.
Definition at line 13 of file recurrent_network_executor.cc.
void caffe2::createSharedBuffer | ( | Workspace * | ws | ) |
Creates a mutex and shared buffer in the workspace.
Not thread-safe, must be called from the constructor.
const CaffeMap< string, const ModuleSchema * > & caffe2::CurrentModules | ( | ) |
Current Modules present in the Caffe2 runtime.
Returns: map: a map of modules and (optionally) their description. The key is the module name, and the value is the description for that module. The module name is recommended to be the part that constitutes the trunk of the dynamic library: for example, a module called libcaffe2_db_rocksdb.so should have the name "caffe2_db_rocksdb". The reason we do not use "lib" is because it's somewhat redundant, and the reason we do not include ".so" is for cross-platform compatibility on platforms like mac os.
void caffe2::EmbeddingLookup | ( | const TIndex | block_size, |
const TIndex | output_size, | ||
const TIndex | index_size, | ||
const TIndex | data_size, | ||
const InType * | input, | ||
const IndexType * | indices, | ||
const int * | lengths, | ||
const float * | weights, | ||
const float * | scale_bias, | ||
bool | normalize_by_lengths, | ||
OutType * | out | ||
) |
Embedding lookup with reduction.
input
of size data_size * block_size indices
of size index_size lengths
of size output_size weights
nullptr or array of size index_size out
of size output_size * block_size sum(lengths[i]) == index_size
Behavior is roughly equivalent to pseudocode:
pos = 0 for (i = 0..index_size-1) for (k = 0..block_size-1) out[i*block_size + k] = 0 for (j = 0..lengths[i]-1) for (k = 0..block_size-1) out[i*block_size + k] += input[indices[pos]*block_size + k] * (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0) pos += 1 if (normalize_by_lengths && lengths[i] > 0) for (k = 0..block_size-1) out[i*block_size + k] /= lengths[i]
void caffe2::Fused8BitRowwiseEmbeddingLookup | ( | const TIndex | block_size, |
const TIndex | output_size, | ||
const TIndex | index_size, | ||
const TIndex | data_size, | ||
const InType * | input, | ||
const IndexType * | indices, | ||
const int * | lengths, | ||
const float * | weights, | ||
bool | normalize_by_lengths, | ||
OutType * | out | ||
) |
Embedding lookup with reduction.
input
of size data_size * (block_size + 8B) indices
of size index_size lengths
of size output_size weights
nullptr or array of size index_size out
of size output_size * block_size sum(lengths[i]) == index_size
Note that block_size should be the number of quantized values per row in the data, i.e. excluding the scale and bias. The total (fused) block size is assumed to be this block_size, plus 4 bytes for scale and 4 bytes for bias.
Behavior is roughly equivalent to pseudocode:
pos = 0 fused_block_size = block_size + 8B // quantized values and scale and bias for (i = 0..index_size-1) for (k = 0..block_size-1) out[i*block_size + k] = 0 for (j = 0..lengths[i]-1) for (k = 0..block_size-1) out[i*block_size + k] += input[indices[pos]*(fused_block_size) + k] * (weights ? weights[IS_WEIGHT_POSITIONAL ? j : pos] : 1.0) pos += 1 if (normalize_by_lengths && lengths[i] > 0) for (k = 0..block_size-1) out[i*block_size + k] /= lengths[i]
CudaMemoryPoolType caffe2::GetCudaMemoryPoolType | ( | ) |
Gets the current memory pool type used by Caffe2.
The memory pool is set up during caffe2's global initialization time.
bool caffe2::GetCudaPeerAccessPattern | ( | vector< vector< bool > > * | pattern | ) |
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values specifying whether peer access is possible.
This function returns false if anything wrong happens during the query of the GPU access pattern.
Definition at line 218 of file common_gpu.cc.
const cudaDeviceProp & caffe2::GetDeviceProperty | ( | const int | device | ) |
Gets the device property for the given device.
This function is thread safe.
Definition at line 166 of file common_gpu.cc.
bool caffe2::GlobalInit | ( | int * | pargc, |
char *** | argv | ||
) |
Initialize the global environment of caffe2.
Caffe2 uses a registration pattern for initialization functions. Custom initialization functions should take the signature bool (*func)(int*, char***) where the pointers to argc and argv are passed in. Caffe2 then runs the initialization in three phases: (1) Functions registered with REGISTER_CAFFE2_EARLY_INIT_FUNCTION. Note that since it is possible the logger is not initialized yet, any logging in such early init functions may not be printed correctly. (2) Parses Caffe-specific commandline flags, and initializes caffe logging. (3) Functions registered with REGISTER_CAFFE2_INIT_FUNCTION. If there is something wrong at each stage, the function returns false. If the global initialization has already been run, the function returns false as well.
MPI_Comm caffe2::GlobalMPIComm | ( | ) |
Gets the global MPI communicator used by Caffe2.
By default, this is MPI_COMM_WORLD unless you call SetGlobalMPIComm().
Definition at line 20 of file mpi_common.cc.
|
inline |
Check if the current running session has a cuda gpu present.
Note that this is different from having caffe2 built with cuda. Building Caffe2 with cuda only guarantees that this function exists. If there are no cuda gpus present in the machine, or there are hardware configuration problems like an insufficient driver, this function will still return false, meaning that there is no usable GPU present.
In the open source build, it is possible that Caffe2's GPU code is dynamically loaded, and as a result a library could be only linked to the CPU code, but want to test if cuda is later available or not. In this case, one should use HasCudaRuntime() from common.h.
Definition at line 99 of file common_gpu.h.
caffe2::IdenticalTypeAndShape | ( | ) |
void caffe2::LoadModule | ( | const string & | name, |
const string & | filename = "" |
||
) |
|
noexcept |
ScopeGuard is a general implementation of the "Resource Acquisition Is Initialization" (RAII) idiom.
Basically, it guarantees that a function is executed upon leaving the current scope unless otherwise told.
The MakeGuard() function is used to create a new ScopeGuard object. It can be instantiated with a lambda function, a std::function<void()>, a functor, or a void(*)() function pointer.
Usage example: Add a friend to memory iff it is also added to the db.
void User::addFriend(User& newFriend) { // add the friend to memory friends_.push_back(&newFriend);
// If the db insertion that follows fails, we should // remove it from memory. auto guard = MakeGuard([&] { friends_.pop_back(); });
// this will throw an exception upon error, which // makes the ScopeGuard execute UserCont::pop_back() // once the Guard's destructor is called. db_->addFriend(GetName(), newFriend.GetName());
// an exception was not thrown, so don't execute // the Guard. guard.dismiss(); }
Examine ScopeGuardTest.cpp for some more sample usage.
Stolen from: Andrei's and Petru Marginean's CUJ article: http://drdobbs.com/184403758 and the loki library: http://loki-lib.sourceforge.net/index.php?n=Idioms.ScopeGuardPointer and triendl.kj article: http://www.codeproject.com/KB/cpp/scope_guard.aspx
Definition at line 153 of file scope_guard.h.
bool caffe2::MatchStrings | ( | string | p, |
string | s | ||
) |
void caffe2::MPISetupPeers | ( | const int | replicas, |
const string & | role, | ||
const string & | job_path | ||
) |
A function used to perform peer setup so one does not need to use mpirun / mpiexec to run the binary.
Note that if you use mpirun or mpiexec to set up the common world, do not use this function - MPI_Init would have already set that up.
This also assumes that you have a common path (like NFS) that multiple instances can read from.
Inputs: replicas (int): the number of replicas that mpi will run with. role (string): the role of this process, "server" or "client". job_path (string): a file name that the server will write its port into and the clients will read the server's port from.
Definition at line 94 of file mpi_common.cc.
bool caffe2::ParseCaffeCommandLineFlags | ( | int * | pargc, |
char *** | pargv | ||
) |
Parses the commandline flags.
This command parses all the commandline arguments passed in via pargc and pargv. Once it is finished, pargc and pargv will contain the remaining commandline args that caffe2 does not deal with. Note that following convention, argv[0] contains the binary name and is not parsed.
void caffe2::SetGlobalMPIComm | ( | MPI_Comm | new_comm | ) |
Sets the global MPI communicator.
Caffe2 takes over the ownership of the passed in communicator.
Definition at line 24 of file mpi_common.cc.
void caffe2::ShowLogInfoToStderr | ( | ) |
A utility to allow one to show log info to stderr after the program starts.
This is similar to calling GLOG's --logtostderr, or setting caffe2_log_level to smaller than INFO. You are recommended to only use this in a few sparse cases, such as when you want to write a tutorial or something. Normally, use the commandline flags to set the log level.
Definition at line 196 of file logging.cc.
it will be coerced into one For an arbitrary n dimensional tensor X in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and k is the axis then X will be coerced into a dimensional tensor with dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}] For the default case where this means the X tensor will be coerced into a tensor of where a_0 is often the batch size In this we must have caffe2::a_0 |
Definition at line 98 of file softmax_op.cc.
where indicates that the corresponding sample should be ignored caffe2::and |
Definition at line 34 of file sigmoid_cross_entropy_loss_op.cc.
Number of replicas The input tensor caffe2::axis |
Definition at line 29 of file concat_split_op.cc.
auto caffe2::batch_size |
Definition at line 102 of file elementwise_linear_op.cc.
const auto caffe2::canonical_axis |
Definition at line 169 of file layer_norm_op.cc.
Number of replicas caffe2::data |
Definition at line 101 of file image_input_op.cc.
int caffe2::default |
Definition at line 48 of file load_save_op.cc.
auto caffe2::dilation_h |
Definition at line 40 of file im2col_op.cc.
auto caffe2::dilation_w |
Definition at line 42 of file im2col_op.cc.
given param caffe2::else |
Definition at line 88 of file reshape_op.cc.
const vector< TensorShape > & caffe2::in |
Definition at line 12 of file batch_gather_ops.cc.
Focal Loss s gamma hyper parameter tensor of softmax caffe2::inputs |
Definition at line 92 of file elementwise_linear_op.cc.
const char* caffe2::kAveragePoolDoc |
Definition at line 730 of file pool_op.cc.
const char* caffe2::kBroadcastDoc |
Definition at line 7 of file elementwise_op_schema.cc.
const char* caffe2::kConvDoc |
Definition at line 7 of file conv_op.cc.
auto caffe2::kernel_h |
Definition at line 36 of file im2col_op.cc.
auto caffe2::kernel_w |
Definition at line 38 of file im2col_op.cc.
const char* caffe2::kMaxPoolDoc |
Definition at line 738 of file pool_op.cc.
auto caffe2::num_classes |
Definition at line 28 of file softmax_with_loss_op.cc.
auto caffe2::order |
See SoftmaxFocalLoss See SoftmaxFocalLoss Gradient of forward caffe2::output |
Tensor of rank caffe2::r |
Definition at line 29 of file batch_gather_ops.cc.
reshape it into matrix of we compute where min_i and max_i minimum and maximum elements of i th and quantize each element caffe2::r_ |
Definition at line 176 of file lengths_reducer_rowwise_8bit_ops.cc.
A tensor of caffe2::rank |
Definition at line 32 of file flatten_op.cc.
reshape it into matrix of we compute caffe2::scale_i |
Definition at line 174 of file lengths_reducer_rowwise_8bit_ops.cc.
auto caffe2::stride_h |
Definition at line 44 of file im2col_op.cc.
auto caffe2::stride_w |
Definition at line 46 of file im2col_op.cc.
auto caffe2::valid_axes |
Definition at line 35 of file transpose_op.cc.
reconstruct values together according to masks A comprehensive False False True Reconstruct We Note that for all mask there must be at least one True If for a field there are multiple True we will accept the first value For False False False True caffe2::values2 |
Definition at line 69 of file boolean_unmask_ops.cc.
caffe2::y |
Definition at line 80 of file margin_ranking_criterion_op.cc.