float measure_lstm_time()

in scripts/cnn.h [413:547]


// Benchmarks one training step (forward, then backward-data + backward-weights)
// of a unidirectional cuDNN LSTM and returns the average time per step.
//
// Parameters:
//   numLayers  - number of stacked LSTM layers.
//   seqLength  - number of time steps; must be in [1, MAX_SEQ_LENGTH] because
//                the per-step descriptors live in fixed-size arrays.
//   batchSize  - mini-batch size at every time step.
//   inputSize  - feature width of each input vector x_t.
//   outputSize - hidden size of the LSTM (also the width of each output y_t).
//
// Returns:
//   t1 + t2, where t1 is the average forward time and t2 the average backward
//   time over REPEAT_TIMES timed iterations, as reported by
//   cudaEventElapsedTime (milliseconds). Returns -1 only if seqLength is out
//   of range and checkCUDNN did not already terminate the program.
//
// Uses names defined outside this function: the cuDNN handle `dnn`, the shared
// scratch buffer `workSpace` / `workSpaceSize`, and the REPEAT_TIMES /
// MAX_SEQ_LENGTH constants. All device buffers are left uninitialized: only
// the run time is measured, the numerical results are never read.
float measure_lstm_time(int numLayers, int seqLength, int batchSize, int inputSize, int outputSize)
{
  // xDescs/yDescs are stack arrays of MAX_SEQ_LENGTH entries and xDescs[0] is
  // dereferenced below, so reject sequence lengths outside [1, MAX_SEQ_LENGTH].
  if (seqLength < 1 || seqLength > MAX_SEQ_LENGTH) {
    printf("measure_lstm_time: seqLength(%d) must be in [1, %d]\n", seqLength, (int)MAX_SEQ_LENGTH);
    checkCUDNN(CUDNN_STATUS_BAD_PARAM);
    return -1.0f;  // backstop in case checkCUDNN does not abort
  }

  cudnnRNNDescriptor_t rnnDesc;
  cudnnDropoutDescriptor_t dropoutDesc;
  cudnnTensorDescriptor_t xDescs[MAX_SEQ_LENGTH], yDescs[MAX_SEQ_LENGTH];
  cudnnTensorDescriptor_t cxDesc, hxDesc, cyDesc, hyDesc;
  float *xData, *yData, *cxData, *cyData, *hxData, *hyData;
  cudnnFilterDescriptor_t wDesc;

  // Byte counts are computed in size_t so large shapes cannot overflow int.
  const size_t xBytes = (size_t)seqLength * batchSize * inputSize * sizeof(float);
  const size_t yBytes = (size_t)seqLength * batchSize * outputSize * sizeof(float);
  const size_t stateBytes = (size_t)numLayers * batchSize * outputSize * sizeof(float);

  // Dropout + RNN descriptors
  float dropout = 0.2f;
  checkCUDNN(cudnnCreateRNNDescriptor(&rnnDesc));
  checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc));
  size_t dropoutSize;
  void *dropoutStates;
  checkCUDNN(cudnnDropoutGetStatesSize(dnn, &dropoutSize));
  checkCUDA(cudaMalloc(&dropoutStates, dropoutSize));
  checkCUDNN(cudnnSetDropoutDescriptor(dropoutDesc, dnn, dropout, dropoutStates, dropoutSize, 10));
  checkCUDNN(cudnnSetRNNDescriptor_v5(rnnDesc, outputSize, numLayers, dropoutDesc,
                                      CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
                                      CUDNN_DATA_FLOAT));

  // Input descriptors: one fully packed (batchSize, inputSize, 1) tensor per step
  for (int i = 0; i < seqLength; i++) {
    checkCUDNN(cudnnCreateTensorDescriptor(&xDescs[i]));
    int dims[] = {batchSize, inputSize, 1};
    int strides[] = {dims[1] * dims[2], dims[2], 1};
    checkCUDNN(cudnnSetTensorNdDescriptor(xDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides));
  }
  // The input buffer must match xDescs (inputSize wide). It is also reused as
  // the dx output of cudnnRNNBackwardData, so an undersized buffer would be
  // written out of bounds, not just read.
  checkCUDA(cudaMalloc(&xData, xBytes));

  // Workspace: the shared workSpace buffer is used, so make sure it is big enough
  size_t requiredWorkSpace;
  checkCUDNN(cudnnGetRNNWorkspaceSize(dnn, rnnDesc, seqLength, xDescs, &requiredWorkSpace));
  if (requiredWorkSpace > (size_t)workSpaceSize) {
    printf("measure_lstm_time: workspace too small (need %zu bytes, have %zu bytes)\n",
           requiredWorkSpace, (size_t)workSpaceSize);
    checkCUDNN(CUDNN_STATUS_BAD_PARAM);
  }

  // ReserveSpace: carries forward-pass state to the backward calls
  size_t reserveSpaceSize;
  void* reserveSpace;
  checkCUDNN(cudnnGetRNNTrainingReserveSize(dnn, rnnDesc, seqLength, xDescs, &reserveSpaceSize));
  checkCUDA(cudaMalloc(&reserveSpace, reserveSpaceSize));

  // Parameters (weights + biases of all layers in one flat buffer)
  size_t paramsSize;
  void* paramsSpace;
  checkCUDNN(cudnnGetRNNParamsSize(dnn, rnnDesc, xDescs[0], &paramsSize, CUDNN_DATA_FLOAT));
  checkCUDA(cudaMalloc(&paramsSpace, paramsSize));

  // Weight filter descriptor. paramsSize is in bytes while the descriptor
  // counts float elements, hence the division.
  {
    int dims[] = {(int)(paramsSize / sizeof(float)), 1, 1};
    checkCUDNN(cudnnCreateFilterDescriptor(&wDesc));
    checkCUDNN(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dims));
  }

  // Hidden/cell state descriptors and buffers: (numLayers, batchSize, outputSize)
  {
    checkCUDNN(cudnnCreateTensorDescriptor(&hxDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&cxDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&hyDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&cyDesc));
    int dims[] = {numLayers, batchSize, outputSize};
    int strides[] = {dims[1] * dims[2], dims[2], 1};
    checkCUDNN(cudnnSetTensorNdDescriptor(hxDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
    checkCUDNN(cudnnSetTensorNdDescriptor(cxDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
    checkCUDNN(cudnnSetTensorNdDescriptor(hyDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
    checkCUDNN(cudnnSetTensorNdDescriptor(cyDesc, CUDNN_DATA_FLOAT, 3, dims, strides));
    checkCUDA(cudaMalloc(&hxData, stateBytes));
    checkCUDA(cudaMalloc(&cxData, stateBytes));
    checkCUDA(cudaMalloc(&hyData, stateBytes));
    checkCUDA(cudaMalloc(&cyData, stateBytes));
  }

  // Output descriptors: one fully packed (batchSize, outputSize, 1) tensor per step
  for (int i = 0; i < seqLength; i++) {
    checkCUDNN(cudnnCreateTensorDescriptor(&yDescs[i]));
    int dims[] = {batchSize, outputSize, 1};
    int strides[] = {dims[1] * dims[2], dims[2], 1};
    checkCUDNN(cudnnSetTensorNdDescriptor(yDescs[i], CUDNN_DATA_FLOAT, 3, dims, strides));
  }
  checkCUDA(cudaMalloc(&yData, yBytes));

  cudaEvent_t start, stop;
  checkCUDA(cudaEventCreate(&start));
  checkCUDA(cudaEventCreate(&stop));
  checkCUDA(cudaDeviceSynchronize());
  float elapsed = 0;

  // Forward time: the first REPEAT_TIMES iterations warm up, the rest are timed
  for (int i = 0; i < 2*REPEAT_TIMES; i++) {
    if (i == REPEAT_TIMES) {
      checkCUDA(cudaDeviceSynchronize());
      checkCUDA(cudaEventRecord(start));
    }
    checkCUDNN(cudnnRNNForwardTraining(dnn, rnnDesc, seqLength, xDescs, xData, hxDesc, hxData, cxDesc, cxData, wDesc, paramsSpace,
                                       yDescs, yData, hyDesc, hyData, cyDesc, cyData, workSpace, workSpaceSize, reserveSpace, reserveSpaceSize));
  }
  checkCUDA(cudaEventRecord(stop));
  checkCUDA(cudaEventSynchronize(stop));
  checkCUDA(cudaEventElapsedTime(&elapsed, start, stop));
  float t1 = elapsed / REPEAT_TIMES;

  // Backward time: same warm-up/timed split. Gradient inputs and outputs
  // alias the forward buffers (dy = y, dx = x, dw = w, ...) since only the
  // run time matters here.
  checkCUDA(cudaDeviceSynchronize());
  for (int i = 0; i < 2*REPEAT_TIMES; i++) {
    if (i == REPEAT_TIMES) {
      checkCUDA(cudaDeviceSynchronize());
      checkCUDA(cudaEventRecord(start));
    }
    checkCUDNN(cudnnRNNBackwardData(dnn, rnnDesc, seqLength, yDescs, yData, yDescs, yData, hyDesc, hyData, cyDesc, cyData, wDesc, paramsSpace,
                                    hxDesc, hxData, cxDesc, cxData, xDescs, xData, hxDesc, hxData, cxDesc, cxData, workSpace, workSpaceSize, reserveSpace, reserveSpaceSize));
    checkCUDNN(cudnnRNNBackwardWeights(dnn, rnnDesc, seqLength, xDescs, xData, hxDesc, hxData, yDescs, yData, workSpace, workSpaceSize, wDesc, paramsSpace,
                                       reserveSpace, reserveSpaceSize));
  }
  checkCUDA(cudaEventRecord(stop));
  checkCUDA(cudaEventSynchronize(stop));
  checkCUDA(cudaEventElapsedTime(&elapsed, start, stop));
  float t2 = elapsed / REPEAT_TIMES;
  checkCUDA(cudaEventDestroy(start));
  checkCUDA(cudaEventDestroy(stop));

  // Release every descriptor and device buffer created above
  checkCUDNN(cudnnDestroyRNNDescriptor(rnnDesc));
  checkCUDNN(cudnnDestroyDropoutDescriptor(dropoutDesc));
  checkCUDA(cudaFree(dropoutStates));
  for (int i = 0; i < seqLength; i++) {
    checkCUDNN(cudnnDestroyTensorDescriptor(xDescs[i]));
    checkCUDNN(cudnnDestroyTensorDescriptor(yDescs[i]));
  }
  checkCUDA(cudaFree(xData));
  checkCUDA(cudaFree(yData));
  checkCUDA(cudaFree(reserveSpace));
  checkCUDA(cudaFree(paramsSpace));
  checkCUDNN(cudnnDestroyFilterDescriptor(wDesc));
  checkCUDNN(cudnnDestroyTensorDescriptor(hxDesc));
  checkCUDNN(cudnnDestroyTensorDescriptor(cxDesc));
  checkCUDNN(cudnnDestroyTensorDescriptor(hyDesc));
  checkCUDNN(cudnnDestroyTensorDescriptor(cyDesc));
  checkCUDA(cudaFree(hxData));
  checkCUDA(cudaFree(cxData));
  checkCUDA(cudaFree(hyData));
  checkCUDA(cudaFree(cyData));
  printf("	LSTM: batch(%d) input(%d) output(%d) t1(%.2lf) t2(%.2lf)\n", batchSize, inputSize, outputSize, t1, t2);
  return t1 + t2;
}