// in scripts/cnn.h [413:547]
float measure_lstm_time(int numLayers, int seqLength, int batchSize, int inputSize, int outputSize)
{
cudnnRNNDescriptor_t rnnDesc;
cudnnDropoutDescriptor_t dropoutDesc;
cudnnTensorDescriptor_t xDescs[MAX_SEQ_LENGTH], yDescs[MAX_SEQ_LENGTH];
cudnnTensorDescriptor_t cxDesc, hxDesc, cyDesc, hyDesc;
float *xData, *yData, *cxData, *cyData, *hxData, *hyData;
cudnnFilterDescriptor_t wDesc;
float dropout = 0.2f;
checkCUDNN(cudnnCreateRNNDescriptor(&rnnDesc));
checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc));
size_t dropoutSize;
void *dropoutStates;
checkCUDNN(cudnnDropoutGetStatesSize(dnn, &dropoutSize));
checkCUDA(cudaMalloc(&dropoutStates, dropoutSize));
checkCUDNN(cudnnSetDropoutDescriptor(dropoutDesc, dnn, dropout, dropoutStates, dropoutSize, 10));
checkCUDNN(cudnnSetRNNDescriptor_v5(rnnDesc, outputSize, numLayers, dropoutDesc,
CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_DATA_FLOAT));
// Create input descriptors
for (int i = 0; i < seqLength; i++) {
checkCUDNN(cudnnCreateTensorDescriptor(&xDescs[i]));
int dims[] = {batchSize, inputSize, 1};
int strides[] = {dims[1] * dims[2], dims[2], 1};
checkCUDNN(cudnnSetTensorNdDescriptor(xDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides));
}
checkCUDA(cudaMalloc(&xData, batchSize * outputSize * seqLength * sizeof(float)));
// Workspace size
size_t myWSS;
checkCUDNN(cudnnGetRNNWorkspaceSize(dnn, rnnDesc, seqLength, xDescs, &myWSS));
// ReserveSpace size
size_t reserveSpaceSize;
void* reserveSpace;
checkCUDNN(cudnnGetRNNTrainingReserveSize(dnn, rnnDesc, seqLength, xDescs, &reserveSpaceSize));
checkCUDA(cudaMalloc(&reserveSpace, reserveSpaceSize));
//printf("reserveSpaceSize = %zu\n", reserveSpaceSize);
// Params Size
size_t paramsSize;
void* paramsSpace;
checkCUDNN(cudnnGetRNNParamsSize(dnn, rnnDesc, xDescs[0], ¶msSize, CUDNN_DATA_FLOAT));
checkCUDA(cudaMalloc(¶msSpace, paramsSize));
//printf("paramsSize = %zu\n", paramsSize);
// Create weight filter
{
int dims[] = {(int)paramsSize, 1, 1};
checkCUDNN(cudnnCreateFilterDescriptor(&wDesc));
checkCUDNN(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dims));
}
// Initialize hx, cx, hy, cy
{
checkCUDNN(cudnnCreateTensorDescriptor(&hxDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&cxDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&hyDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&cyDesc));
int dims[] = {numLayers, batchSize, outputSize};
int strides[] = {dims[1] * dims[2], dims[2], 1};
checkCUDNN(cudnnSetTensorNdDescriptor(hxDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
checkCUDNN(cudnnSetTensorNdDescriptor(cxDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
checkCUDNN(cudnnSetTensorNdDescriptor(hyDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
checkCUDNN(cudnnSetTensorNdDescriptor(cyDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
checkCUDA(cudaMalloc(&hxData, dims[0] * dims[1] * dims[2] * sizeof(float)));
checkCUDA(cudaMalloc(&cxData, dims[0] * dims[1] * dims[2] * sizeof(float)));
checkCUDA(cudaMalloc(&hyData, dims[0] * dims[1] * dims[2] * sizeof(float)));
checkCUDA(cudaMalloc(&cyData, dims[0] * dims[1] * dims[2] * sizeof(float)));
}
// Initialize yDescs
for (int i = 0; i < seqLength; i++) {
checkCUDNN(cudnnCreateTensorDescriptor(&yDescs[i]));
int dims[] = {batchSize, outputSize, 1};
int strides[] = {dims[1] * dims[2], dims[2], 1};
checkCUDNN(cudnnSetTensorNdDescriptor(yDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides));
}
checkCUDA(cudaMalloc(&yData, batchSize * outputSize * seqLength * sizeof(float)));
cudaEvent_t start, stop;
checkCUDA(cudaEventCreate(&start));
checkCUDA(cudaEventCreate(&stop));
checkCUDA(cudaDeviceSynchronize());
float elapsed = 0;
for (int i = 0; i < 2*REPEAT_TIMES; i++) {
// Use the first REPEAT_TIMES to warm up
if (i==REPEAT_TIMES) {
checkCUDA(cudaDeviceSynchronize());
checkCUDA(cudaEventRecord(start));
}
checkCUDNN(cudnnRNNForwardTraining(dnn, rnnDesc, seqLength, xDescs, xData, hxDesc, hxData, cxDesc, cxData, wDesc, paramsSpace,
yDescs, yData, hyDesc, hyData, cyDesc, cyData, workSpace, workSpaceSize, reserveSpace, reserveSpaceSize));
}
checkCUDA(cudaEventRecord(stop));
checkCUDA(cudaEventSynchronize(stop));
cudaEventElapsedTime(&elapsed, start, stop);
float t1 = elapsed / REPEAT_TIMES;
// Backward Time
checkCUDA(cudaDeviceSynchronize());
for (int i = 0; i < 2*REPEAT_TIMES; i++) {
if (i == REPEAT_TIMES) {
checkCUDA(cudaDeviceSynchronize());
checkCUDA(cudaEventRecord(start));
}
checkCUDNN(cudnnRNNBackwardData(dnn, rnnDesc, seqLength, yDescs, yData, yDescs, yData, hyDesc, hyData, cyDesc, cyData, wDesc, paramsSpace,
hxDesc, hxData, cxDesc, cxData, xDescs, xData, hxDesc, hxData, cxDesc, cxData, workSpace, workSpaceSize, reserveSpace, reserveSpaceSize));
checkCUDNN(cudnnRNNBackwardWeights(dnn, rnnDesc, seqLength, xDescs, xData, hxDesc, hxData, yDescs, yData, workSpace, workSpaceSize, wDesc, paramsSpace,
reserveSpace, reserveSpaceSize));
}
checkCUDA(cudaEventRecord(stop));
checkCUDA(cudaEventSynchronize(stop));
cudaEventElapsedTime(&elapsed, start, stop);
float t2 = elapsed / REPEAT_TIMES;
checkCUDA(cudaEventDestroy(start));
checkCUDA(cudaEventDestroy(stop));
checkCUDNN(cudnnDestroyRNNDescriptor(rnnDesc));
checkCUDNN(cudnnDestroyDropoutDescriptor(dropoutDesc));
checkCUDA(cudaFree(dropoutStates));
for (int i = 0; i < seqLength; i++) {
checkCUDNN(cudnnDestroyTensorDescriptor(xDescs[i]));
checkCUDNN(cudnnDestroyTensorDescriptor(yDescs[i]));
}
checkCUDA(cudaFree(xData));
checkCUDA(cudaFree(yData));
checkCUDA(cudaFree(reserveSpace));
checkCUDA(cudaFree(paramsSpace));
checkCUDNN(cudnnDestroyTensorDescriptor(hxDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(cxDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(hyDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(cyDesc));
checkCUDA(cudaFree(hxData));
checkCUDA(cudaFree(cxData));
checkCUDA(cudaFree(hyData));
checkCUDA(cudaFree(cyData));
printf(" LSTM: batch(%d) input(%d) output(%d) t1(%.2lf) t2(%.2lf)\n", batchSize, inputSize, outputSize, t1, t2);
return t1 + t2;
}