A GPU operator that can generate limited elementwise operations. More...
Public Member Functions | |
ElementwiseRTCOp (const OperatorDef &operator_def, Workspace *ws) | |
bool | RunOnDevice () override |
Public Member Functions inherited from caffe2::Operator< CUDAContext > | |
Operator (const OperatorDef &operator_def, Workspace *ws) | |
const Tensor< CUDAContext > & | Input (int idx) |
Tensor< CUDAContext > * | Output (int idx) |
void | WaitEvent (const Event &ev, int stream_id=-1) final |
void | WaitEvents (const std::vector< const Event * > &events, int stream_id=-1) final |
bool | Run (int stream_id=0) final |
bool | RunAsync (int stream_id=0) final |
bool | IsStreamFree (int stream_id) const override |
bool | HasAsyncPart () const override |
bool | SupportsAsyncScheduling () const override |
const CUDAContext * | getContext () const |
Public Member Functions inherited from caffe2::OperatorBase | |
OperatorBase (const OperatorDef &operator_def, Workspace *ws) | |
bool | HasArgument (const string &name) const |
Checks if the operator has an argument of the given name. | |
template<typename T > | |
T | GetSingleArgument (const string &name, const T &default_value) const |
template<typename T > | |
bool | HasSingleArgumentOfType (const string &name) const |
template<typename T > | |
vector< T > | GetRepeatedArgument (const string &name, const vector< T > &default_value={}) const |
template<typename T > | |
const T & | Input (int idx) |
template<typename T > | |
T * | Output (int idx) |
template<typename T > | |
T * | Output (int idx, T *allocated) |
const Blob & | InputBlob (int idx) |
Blob * | OutputBlob (int idx) |
template<typename T > | |
bool | InputIsType (int idx) |
template<typename T > | |
bool | OutputIsType (int idx) |
int | InputSize () const |
int | OutputSize () const |
const vector< const Blob * > & | Inputs () const |
const vector< Blob * > & | Outputs () |
vector< TensorShape > | InputTensorShapes () |
void | Wait (const OperatorBase &other, int stream_id=-1) |
virtual void | Finish () |
virtual void | AddRelatedBlobInfo (EnforceNotMet *err) |
const OperatorDef & | debug_def () const |
void | set_debug_def (const std::shared_ptr< const OperatorDef > &operator_def) |
bool | has_debug_def () const |
void | RecordLastFailedOpNetPosition () |
int | net_position () const |
void | set_net_position (int idx) |
const DeviceOption & | device_option () const |
const Event & | event () const |
Event & | event () |
void | ResetEvent () |
void | DisableEvent () |
bool | IsEventDisabled () const |
const std::string & | type () const |
void | annotate_engine (const std::string &engine) |
const std::string & | engine () const |
Public Member Functions inherited from caffe2::Observable< OperatorBase > | |
const Observer * | AttachObserver (std::unique_ptr< Observer > observer) |
std::unique_ptr< Observer > | DetachObserver (const Observer *observer_ptr) |
Returns a unique_ptr to the removed observer. More... | |
virtual size_t | NumObservers () |
void | StartAllObservers () |
void | StopAllObservers () |
Additional Inherited Members | |
Public Types inherited from caffe2::Observable< OperatorBase > | |
using | Observer = ObserverBase< OperatorBase > |
Static Public Attributes inherited from caffe2::OperatorBase | |
static constexpr int | kNoNetPositionSet = -1 |
Protected Member Functions inherited from caffe2::Operator< CUDAContext > | |
void | RecordEvent (const char *err_msg=nullptr) final |
std::string | getErrorMsg () |
Protected Member Functions inherited from caffe2::OperatorBase | |
DISABLE_COPY_AND_ASSIGN (OperatorBase) | |
Protected Attributes inherited from caffe2::Operator< CUDAContext > | |
CUDAContext | context_ |
Protected Attributes inherited from caffe2::OperatorBase | |
std::unique_ptr< Event > | event_ |
Protected Attributes inherited from caffe2::Observable< OperatorBase > | |
std::vector< std::unique_ptr< Observer > > | observers_list_ |
A GPU operator that can generate limited elementwise operations.
ElementwiseRTCOp allows one to do a simple and limited thing: it takes in multiple inputs and multiple outputs, as well as a raw string argument rtc_src. The runtime then generates the following kernel code:
    __global__ void kernel_name(const size_t nthreads, ...) {
      for (int index = blockIdx.x * blockDim.x + threadIdx.x;
           index < nthreads; index += blockDim.x * gridDim.x) {
        rtc_src
      }
    }
where the "..." part is auto-generated, so one can refer to the inputs and outputs as in0, in1, ..., out0, out1, ... in the rtc_src string.
For example, if one wants to do an elementwise vector multiplication, one can take two inputs and one output, and write rtc_src as:
    out0[index] = in0[index] * in1[index];
This op is currently highly experimental. We do not have a gradient registered for it either.
Definition at line 74 of file elemenntwise_rtc_gpu.cc.