#include "caffe2/operators/rnn/recurrent_network_executor_gpu.h"

#include "caffe2/core/context_gpu.h"

// Factory specialization for CUDA. Builds a CUDARecurrentNetworkExecutor for
// the given step net, and optionally caps the number of CUDA streams it may
// use via the "rnn_executor.max_cuda_streams" argument (0 or absent keeps
// the executor's default). Ownership of the executor is returned to the
// caller via unique_ptr.
template <>
std::unique_ptr<RecurrentNetworkExecutorBase> createRNNExecutor<CUDAContext>(
    const NetDef& step_net_def,
    std::map<string, string>& recurrent_input_map,
    std::string timestep_blob,
    ArgumentHelper arg_helper) {
  auto* exec = new CUDARecurrentNetworkExecutor(
      step_net_def, recurrent_input_map, timestep_blob);
  int max_streams =
      arg_helper.GetSingleArgument<int>("rnn_executor.max_cuda_streams", 0);
  if (max_streams > 0) {
    exec->setMaxStreams(max_streams);
    LOG(INFO) << "Set max streams:" << max_streams;
  }
  // Transfer ownership of the raw pointer to the caller.
  std::unique_ptr<RecurrentNetworkExecutorBase> ptr(exec);
  return ptr;
}
// Destroy all CUDA events created during execution. events_ is resized with
// nullptr fill in _ExecRange and slots are only populated lazily, so unused
// slots must be skipped: cudaEventDestroy on a null event returns an error,
// which CUDA_CHECK would turn into a fatal failure.
CUDARecurrentNetworkExecutor::~CUDARecurrentNetworkExecutor() {
  for (cudaEvent_t ev : events_) {
    if (ev != nullptr) {
      CUDA_CHECK(cudaEventDestroy(ev));
    }
  }
}
/**
 * Special execution path for CUDA. It runs ops with as little overhead as
 * possible, and exploits "frontier execution" parallelism: kernels from the
 * next timestep can be launched in parallel with the current timestep by
 * assigning one stream per timestep (round-robin over max_streams).
 * Cross-timestep ordering is enforced with CUDA events
 * (cudaEventRecord / cudaStreamWaitEvent).
 *
 * from/to define a half-open timestep range; to < from runs backwards.
 */
void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) {
  // +1 for forward execution, -1 for backward.
  int direction = to > from ? 1 : -1;

  // Cap the number of concurrent streams by max_parallel_timesteps_ when set.
  int max_streams = max_parallel_timesteps_ > 0
      ? std::min(max_parallel_timesteps_, max_cuda_streams_)
      : max_cuda_streams_;
  int stream_seq = 0;
  int num_ops = timestep_ops_[0].size();

  // One event slot per (timestep, op); created lazily, nullptr until used.
  events_.resize(num_ops * timestep_ops_.size(), nullptr);

  int gpu_id = -1;

  // Loop over timesteps.
  for (int t = from; t != to; t += direction) {
    bool first_timestep = t == from;
    bool last_timestep =
        (direction == -1 && t == 0) || (direction == 1 && t == to - 1);
    auto& ops = timestep_ops_[t];
    int stream_id = stream_seq % max_streams;

    for (int i = 0; i < ops.size(); i++) {
      auto& rnn_op = ops[i];

      // Special handling for link ops -- we just run them directly:
      // they do not execute any kernels.
      if (rnn_op.link_op) {
        rnn_op.op->RunAsync(stream_id);
        CAFFE_ENFORCE(
            rnn_op.dependencies.empty(),
            "GPU executor ignores link dependencies");
        continue;
      }

      // All GPU ops must live on a single device; latch its id from the
      // first CUDA op we see (device_type 1 == CUDA, 0 == CPU in this
      // proto schema).
      if (gpu_id == -1 && rnn_op.op->device_option().device_type() == 1) {
        gpu_id = rnn_op.op->device_option().cuda_gpu_id();
      } else {
        CAFFE_ENFORCE(
            rnn_op.op->device_option().device_type() == 0 ||
                rnn_op.op->device_option().cuda_gpu_id() == gpu_id,
            "RNN Executor only supports ops on one GPU");
      }

      // If this op has recurrent parents (ops recorded in the previous
      // timestep), make this stream wait on their events so those parents
      // complete their work first.
      if (has_timestep_parallelism_ && !first_timestep) {
        for (int parent : rnn_op.parents) {
          // parent > i marks a recurrent (cross-timestep) parent.
          if (parent > i) {
            int parent_ev_idx = (t - direction) * num_ops + parent;
            CHECK(events_.size() > parent_ev_idx);
            CAFFE_ENFORCE(events_[parent_ev_idx] != nullptr);
            CUDA_CHECK(cudaStreamWaitEvent(
                CUDAContext::cuda_stream(gpu_id, stream_id),
                events_[parent_ev_idx],
                0));
          }
        }
      }

      // Launch the op asynchronously in the chosen stream.
      rnn_op.op->RunAsync(stream_id);

      // Create and record an event for this op if it has at least one
      // recurrent dependent, i.e. an op in the next timestep that must
      // wait for it.
      if (has_timestep_parallelism_ && !last_timestep) {
        for (int dep : rnn_op.dependencies) {
          if (dep < i) { // dep < i marks a recurrent dependent
            int event_idx = t * num_ops + i;
            if (events_[event_idx] == nullptr) {
              CUDA_CHECK(cudaEventCreate(&events_[event_idx]));
            }
            CUDA_CHECK(cudaEventRecord(
                events_[event_idx],
                CUDAContext::cuda_stream(gpu_id, stream_id)));
            break; // one event per op is enough
          }
        }
      }
    } // for over ops

    // Next timestep runs on a different stream.
    if (has_timestep_parallelism_) {
      stream_seq++;
    }
  } // for over timesteps

  // Wait for all started streams to complete before returning.
  for (int stream_id = 0; stream_id <= std::min(stream_seq, max_streams - 1);
       stream_id++) {
    VLOG(1) << "Wait for stream:" << stream_id;
    CUDA_CHECK(
        cudaStreamSynchronize(CUDAContext::cuda_stream(gpu_id, stream_id)));
  }
}
// Run the step net forward over timesteps [0, T). Always reports success;
// CUDA failures abort inside _ExecRange via CUDA_CHECK.
bool CUDARecurrentNetworkExecutor::Run(int T) {
  _ExecRange(0, T);
  return true;
}
// Run the step net backwards, from timestep T-1 down to 0 inclusive
// (_ExecRange's `to` bound of -1 is exclusive). Always reports success;
// CUDA failures abort inside _ExecRange via CUDA_CHECK.
bool CUDARecurrentNetworkExecutor::RunBackwards(int T) {
  _ExecRange(T - 1, -1);
  return true;
}
// NOTE(review): stray text fragment, apparently pasted from another file's
// documentation ("A global dictionary that holds information about what
// Caffe2 modules have been loaded in the current ..."). It does not belong
// in this translation unit; bare prose here is a syntax error, so it is
// preserved only as this comment pending confirmation of its origin.