in src/operator/cudnn_rnn-inl.h [331:580]
inline void Init(mshadow::Stream<gpu> *s,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data) {
using namespace mshadow;
#if CUDNN_MAJOR >= 5
format_ = CUDNN_TENSOR_NCHW;
#endif
size_t in_expected = param_.lstm_q_ ? 4 : 3;
size_t out_expected = param_.lstm_q_ ? 3 : 2;
if (!param_.state_outputs)
out_expected = 1;
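// LSTM carries an extra cell state, hence one more input and, when state
// outputs are requested, one more output than the other RNN modes.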
CHECK_EQ(in_data.size(), in_expected);
CHECK_EQ(out_data.size(), out_expected);
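// Descriptors and work buffers are created once, on the first forward pass,
// and reused on every subsequent call.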
if (!init_cudnn_) {
init_cudnn_ = true;
// get input + output tensors
Tensor<gpu, 3, DType> x = in_data[rnn_enum::kData].get<gpu, 3, DType>(s);
Tensor<gpu, 1, DType> w = in_data[rnn_enum::kParams].get<gpu, 1, DType>(s);
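// The input is laid out as (seq_length, batch_size, input_size).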
param_.seq_length_ = x.shape_[0];
param_.batch_size_ = x.shape_[1];
param_.input_size_ = x.shape_[2];
// Tensor Descriptors
std::vector<cudnnTensorDescriptor_t> x_vec(param_.seq_length_);
std::vector<cudnnTensorDescriptor_t> y_vec(param_.seq_length_);
std::vector<cudnnTensorDescriptor_t> dx_vec(param_.seq_length_);
std::vector<cudnnTensorDescriptor_t> dy_vec(param_.seq_length_);
int dimA[3];
int strideA[3];
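// One 3-D descriptor per time step: x/dx are (batch_size, input_size, 1),
// y/dy are (batch_size, state_size * num_directions, 1), all fully packed.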
for (int i = 0; i < param_.seq_length_; i++) {
CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i]));
CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i]));
CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i]));
CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i]));
dimA[0] = param_.batch_size_;
dimA[1] = param_.input_size_;
dimA[2] = 1;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i],
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i],
dtype_,
3,
dimA,
strideA));
dimA[0] = param_.batch_size_;
dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size;
dimA[2] = 1;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i],
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i],
dtype_,
3,
dimA,
strideA));
}
x_desc_vec_ = x_vec;
y_desc_vec_ = y_vec;
dx_desc_vec_ = dx_vec;
dy_desc_vec_ = dy_vec;
// Set the hidden/cell state descriptors:
// (num_layers * num_directions, batch_size, state_size), fully packed.
dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1);
dimA[1] = param_.batch_size_;
dimA[2] = param_.state_size;
strideA[0] = dimA[2] * dimA[1];
strideA[1] = dimA[2];
strideA[2] = 1;
CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_,
dtype_,
3,
dimA,
strideA));
CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_,
dtype_,
3,
dimA,
strideA));
// Create Dropout descriptors
if (param_.p > 0) {
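// Dropout needs a persistent RNG state buffer on the GPU; query its size
// and allocate it up front.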
CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_));
dropout_size_ = dropout_byte_ / sizeof(DType);
dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU());
} else {
dropout_states_ = {};
dropout_byte_ = 0;
}
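// Bind the state buffer (empty when p == 0, i.e. no dropout) to the
// dropout descriptor.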
CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_,
param_.p, // discard probability
dropout_states_.dptr, dropout_byte_,
seed_));
// RNN descriptors
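// cuDNN 6+ exposes cudnnSetRNNDescriptor_v6, which additionally takes the
// RNN algorithm; older releases use the legacy setter.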
#if CUDNN_MAJOR >= 6
cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_,
rnn_desc_,
param_.state_size,
param_.num_layers,
dropout_desc_,
input_mode_,
direction_,
mode_,
rnn_algo,
dtype_));
#else
CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_,
param_.state_size,
param_.num_layers,
dropout_desc_,
input_mode_,
direction_,
mode_,
dtype_));
#endif
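// On cuDNN 7+ opt into Tensor Core kernels where the chosen algorithm
// supports them; cuDNN 7.2+ can additionally down-convert FP32 math to use
// Tensor Cores when the user allows it.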
#if CUDNN_MAJOR >= 7
cudnnMathType_t math_type = CUDNN_DEFAULT_MATH;
if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) {
math_type = CUDNN_TENSOR_OP_MATH;
}
#if CUDNN_VERSION >= 7200
if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() &&
(DataType<DType>::kFlag != kFloat16))
math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION;
#endif
CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type));
#endif
// Get temp space sizes
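// The workspace is per-call scratch memory; the reserve space must persist
// from the forward pass to the matching backward pass during training.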
CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
&workspace_byte_));
CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
&reserve_space_byte_));
workspace_size_ = workspace_byte_ / sizeof(DType);
// Allocate the reserve space
reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU());
// Check that the number of parameters is correct
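// cudnnGetRNNParamsSize reports the total byte size of all weights and
// biases for this configuration; it must match the flat parameter tensor.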
size_t cudnn_param_size;
CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_,
rnn_desc_,
x_desc_vec_[0],
&cudnn_param_size,
dtype_));
CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size);
// Set param descriptors
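// cuDNN views the packed weights as a 3-D "filter" of shape
// (total_param_count, 1, 1).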
int dim_w[3] = {1, 1, 1};
dim_w[0] = w.shape_[0];
CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_,
dtype_,
format_,
3,
dim_w));
CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_,
dtype_,
format_,
3,
dim_w));
// Debug helper (disabled): query and log cuDNN's internal weight/bias layout.
// cudnnFilterDescriptor_t m_desc;
// CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS);
// DType *p;
// int n = 2;
// int64_t last = 0;
// if (param_.mode == rnn_enum::kLstm) n = 8;
// else if (param_.mode == rnn_enum::kGru) n = 6;
// for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) {
// for (int j = 0; j < n; ++j) {
// CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_,
// i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS);
// LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last;
// last = ((int64_t)(p - NULL))/sizeof(DType);
// cudnnDataType_t t;
// cudnnTensorFormat_t f;
// int ndim = 5;
// int dims[5] = {0, 0, 0, 0, 0};
// CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]),
// CUDNN_STATUS_SUCCESS);
// LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType);
// for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i];
// }
// }
// for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) {
// for (int j = 0; j < n; ++j) {
// CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0],
// w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS);
// LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last;
// last = ((int64_t)(p - NULL))/sizeof(DType);
// LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType);
// }
// }
}
}