in tensorflow/tensorflow/stream_executor/cuda/cuda_blas.cc [540:3226]
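// Runs a cuBLAS routine while holding the handle mutex: binds the cuBLAS
// handle to `stream`, optionally enables a tensor-op / TF32 math mode
// (CUDA 9+), selects host- or device-side pointer mode for the scalar
// arguments, invokes `cublas_func`, and, when err_on_failure or VLOG(3) is
// set, logs the failing call's arguments. The Scoped* RAII helpers restore
// the previous math and pointer modes when this scope exits.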
template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
bool pointer_mode_host, bool err_on_failure,
cublasMath_t math_type, Args... args) {
absl::MutexLock lock(&mu_);
CHECK(blas_ != nullptr);
if (!SetStream(stream)) {
return false;
}
#if CUDA_VERSION >= 9000
ScopedCublasMathMode math_mode{blas_};
#if CUBLAS_VER_MAJOR >= 11
if (math_type == CUBLAS_TF32_TENSOR_OP_MATH &&
tensorflow::tensor_float_32_execution_enabled()) {
#else
if (math_type == CUBLAS_TENSOR_OP_MATH) {
#endif
if (!math_mode.Init(math_type)) {
return false;
}
}
#endif
gpu::ScopedActivateExecutorContext sac{parent_};
ScopedCublasPointerMode pointer_mode{blas_};
if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
: CUBLAS_POINTER_MODE_DEVICE)) {
return false;
}
cublasStatus_t ret = cublas_func(blas_, args...);
if ((err_on_failure || VLOG_IS_ON(3)) && ret != CUBLAS_STATUS_SUCCESS) {
LOG(ERROR) << "DoBlasInternalImpl: failed to run cuBLAS routine: "
<< ToString(ret);
std::stringstream ss;
PrintArgs(ss, 0, args...);
LOG(ERROR) << "cuBLAS args: ";
LOG(ERROR) << ss.str();
}
return ret == CUBLAS_STATUS_SUCCESS;
}
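// The DoBlasInternal calls below go through a thin convenience wrapper
// declared alongside this file. A minimal sketch of the shape it is assumed
// to have (erroring on failure, default math mode):
//
//   template <typename FuncT, typename... Args>
//   bool DoBlasInternal(FuncT cublas_func, Stream *stream,
//                       bool pointer_mode_host, Args... args) {
//     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
//                               /*err_on_failure=*/true, CUBLAS_DEFAULT_MATH,
//                               args...);
//   }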
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *result) {
return DoBlasInternal(cublasSasum, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *result) {
return DoBlasInternal(cublasDasum, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<float> *result) {
return DoBlasInternal(cublasScasum, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<double> *result) {
return DoBlasInternal(cublasDzasum, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
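// The reduction routines above (and dot/nrm2/iamax/iamin below) pass
// pointer_mode_host = false: `result` is a device pointer, so cuBLAS writes
// the scalar result directly into device memory with no host round trip.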
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSaxpy, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDaxpy, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCaxpy, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZaxpy, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
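// Scaling routines pass pointer_mode_host = true: alpha/beta live on the
// host stack, and GpuComplexValue converts a std::complex<T> into the
// matching cuComplex/cuDoubleComplex value. With host pointer mode, cuBLAS
// reads the scalar during the call itself, so passing the address of a local
// temporary like cb_alpha is safe.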
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasScopy, stream, true /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDcopy, stream, true /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<std::complex<float>> *y, int incy) {
return DoBlasInternal(cublasCcopy, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<std::complex<double>> *y, int incy) {
return DoBlasInternal(cublasZcopy, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
const DeviceMemory<float> &y, int incy,
DeviceMemory<float> *result) {
return DoBlasInternal(cublasSdot, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
const DeviceMemory<double> &y, int incy,
DeviceMemory<double> *result) {
return DoBlasInternal(cublasDdot, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx, GpuMemory(y), incy,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *result) {
return DoBlasInternal(cublasCdotc, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(result)));
}
bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *result) {
return DoBlasInternal(cublasZdotc, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(result)));
}
bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *result) {
return DoBlasInternal(cublasCdotu, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(result)));
}
bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *result) {
return DoBlasInternal(cublasZdotu, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(result)));
}
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *result) {
return DoBlasInternal(cublasSnrm2, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *result) {
return DoBlasInternal(cublasDnrm2, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<float> *result) {
return DoBlasInternal(cublasScnrm2, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<double> *result) {
return DoBlasInternal(cublasDznrm2, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
DeviceMemory<float> *x, int incx,
DeviceMemory<float> *y, int incy, float c, float s) {
return DoBlasInternal(cublasSrot, stream, true /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy, &c, &s);
}
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
DeviceMemory<double> *x, int incx,
DeviceMemory<double> *y, int incy, double c,
double s) {
return DoBlasInternal(cublasDrot, stream, true /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy, &c, &s);
}
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
DeviceMemory<std::complex<float>> *x, int incx,
DeviceMemory<std::complex<float>> *y, int incy,
float c, float s) {
return DoBlasInternal(cublasCsrot, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
DeviceMemory<std::complex<double>> *x, int incx,
DeviceMemory<std::complex<double>> *y, int incy,
double c, double s) {
return DoBlasInternal(cublasZdrot, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
}
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
DeviceMemory<float> *b, DeviceMemory<float> *c,
DeviceMemory<float> *s) {
return DoBlasInternal(cublasSrotg, stream, false /* = pointer_mode_host */,
GpuMemoryMutable(a), GpuMemoryMutable(b),
GpuMemoryMutable(c), GpuMemoryMutable(s));
}
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
DeviceMemory<double> *b, DeviceMemory<double> *c,
DeviceMemory<double> *s) {
return DoBlasInternal(cublasDrotg, stream, false /* = pointer_mode_host */,
GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
GpuMemoryMutable(c), GpuMemoryMutable(s));
}
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
DeviceMemory<std::complex<float>> *b,
DeviceMemory<float> *c,
DeviceMemory<std::complex<float>> *s) {
return DoBlasInternal(
cublasCrotg, stream, false /* = pointer_mode_host */,
GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
DeviceMemory<std::complex<double>> *b,
DeviceMemory<double> *c,
DeviceMemory<std::complex<double>> *s) {
return DoBlasInternal(
cublasZrotg, stream, false /* = pointer_mode_host */,
GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
}
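// Note that GpuComplex(...) applied to a real-typed pointer is an identity
// pass-through; it only reinterprets std::complex<T>* as the corresponding
// cuComplex*/cuDoubleComplex* type. This is why wrapping the plain-double
// argument as GpuComplex(GpuMemoryMutable(a)) in the cublasDrotg call above
// is harmless.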
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
DeviceMemory<float> *x, int incx,
DeviceMemory<float> *y, int incy,
const DeviceMemory<float> &param) {
return DoBlasInternal(cublasSrotm, stream, false /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy, GpuMemory(param));
}
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
DeviceMemory<double> *x, int incx,
DeviceMemory<double> *y, int incy,
const DeviceMemory<double> &param) {
return DoBlasInternal(cublasDrotm, stream, false /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy, GpuMemory(param));
}
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
DeviceMemory<float> *d2, DeviceMemory<float> *x1,
const DeviceMemory<float> &y1,
DeviceMemory<float> *param) {
return DoBlasInternal(cublasSrotmg, stream, false /* = pointer_mode_host */,
GpuMemoryMutable(d1), GpuMemoryMutable(d2),
GpuMemoryMutable(x1), GpuMemory(y1),
GpuMemoryMutable(param));
}
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
DeviceMemory<double> *d2, DeviceMemory<double> *x1,
const DeviceMemory<double> &y1,
DeviceMemory<double> *param) {
return DoBlasInternal(cublasDrotmg, stream, false /* = pointer_mode_host */,
GpuMemoryMutable(d1), GpuMemoryMutable(d2),
GpuMemoryMutable(x1), GpuMemory(y1),
GpuMemoryMutable(param));
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
DeviceMemory<float> *x, int incx) {
return DoBlasInternal(cublasSscal, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDscal, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
DeviceMemory<std::complex<float>> *x, int incx) {
return DoBlasInternal(cublasCsscal, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuComplex(GpuMemoryMutable(x)),
incx);
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
DeviceMemory<std::complex<double>> *x, int incx) {
return DoBlasInternal(cublasZdscal, stream, true /* = pointer_mode_host */,
elem_count, &alpha, GpuComplex(GpuMemoryMutable(x)),
incx);
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
std::complex<float> alpha,
DeviceMemory<std::complex<float>> *x, int incx) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCscal, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(&cb_alpha),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
std::complex<double> alpha,
DeviceMemory<std::complex<double>> *x, int incx) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZscal, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(&cb_alpha),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
DeviceMemory<float> *x, int incx,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSswap, stream, true /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
DeviceMemory<double> *x, int incx,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDswap, stream, true /* = pointer_mode_host */,
elem_count, GpuMemoryMutable(x), incx,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
DeviceMemory<std::complex<float>> *x, int incx,
DeviceMemory<std::complex<float>> *y, int incy) {
return DoBlasInternal(cublasCswap, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
DeviceMemory<std::complex<double>> *x, int incx,
DeviceMemory<std::complex<double>> *y, int incy) {
return DoBlasInternal(cublasZswap, stream, true /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemoryMutable(x)), incx,
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIsamax, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIdamax, stream, false /* = pointer_mode_host */,
elem_count, GpuMemory(x), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIcamax, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x,
int incx, DeviceMemory<int> *result) {
return DoBlasInternal(cublasIzamax, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
const DeviceMemory<float> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIsamin, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
const DeviceMemory<double> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIdamin, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<int> *result) {
return DoBlasInternal(cublasIcamin, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
const DeviceMemory<std::complex<double>> &x,
int incx, DeviceMemory<int> *result) {
return DoBlasInternal(cublasIzamin, stream, false /* = pointer_mode_host */,
elem_count, GpuComplex(GpuMemory(x)), incx,
GpuMemoryMutable(result));
}
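// cublasI{s,d,c,z}am{ax,in} report a 1-based index (Fortran convention) into
// the device-resident `result`; callers expecting 0-based indexing must
// adjust after copying the value back to the host.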
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, uint64 kl, uint64 ku, float alpha,
const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &x, int incx, float beta,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSgbmv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
GpuMemory(a), lda, GpuMemory(x), incx, &beta,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, uint64 kl, uint64 ku, double alpha,
const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &x, int incx, double beta,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDgbmv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, kl, ku, &alpha,
GpuMemory(a), lda, GpuMemory(x), incx, &beta,
GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, uint64 kl, uint64 ku,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCgbmv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, kl, ku,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(x)), incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, uint64 kl, uint64 ku,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZgbmv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, kl, ku,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(x)), incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, float alpha, const DeviceMemory<float> &a,
int lda, const DeviceMemory<float> &x, int incx,
float beta, DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSgemv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, double alpha, const DeviceMemory<double> &a,
int lda, const DeviceMemory<double> &x, int incx,
double beta, DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDgemv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a),
lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCgemv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
uint64 n, std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZgemv, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(trans), m, n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
const DeviceMemory<float> &x, int incx,
const DeviceMemory<float> &y, int incy,
DeviceMemory<float> *a, int lda) {
return DoBlasInternal(cublasSger, stream, true /* = pointer_mode_host */, m,
n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
const DeviceMemory<double> &x, int incx,
const DeviceMemory<double> &y, int incy,
DeviceMemory<double> *a, int lda) {
return DoBlasInternal(cublasDger, stream, true /* = pointer_mode_host */, m,
n, &alpha, GpuMemory(x), incx, GpuMemory(y), incy,
GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCgerc, stream, true /* = pointer_mode_host */, m,
n, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(x)),
incx, GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZgerc, stream, true /* = pointer_mode_host */, m,
n, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(x)),
incx, GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCgeru, stream, true /* = pointer_mode_host */, m,
n, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(x)),
incx, GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZgeru, stream, true /* = pointer_mode_host */, m,
n, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(x)),
incx, GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
uint64 k, std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasChbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, k, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
uint64 k, std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZhbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, k, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasChemv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZhemv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<std::complex<float>> *a, int lda) {
return DoBlasInternal(cublasCher, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha,
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<std::complex<double>> *a, int lda) {
return DoBlasInternal(cublasZher, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha,
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCher2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *a, int lda) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZher2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(a)), lda);
}
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &ap,
const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasChpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &ap,
const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *y, int incy) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZhpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)),
incx, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(y)), incy);
}
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
DeviceMemory<std::complex<float>> *ap) {
return DoBlasInternal(cublasChpr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha,
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(ap)));
}
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
DeviceMemory<std::complex<double>> *ap) {
return DoBlasInternal(cublasZhpr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha,
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemoryMutable(ap)));
}
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &x, int incx,
const DeviceMemory<std::complex<float>> &y, int incy,
DeviceMemory<std::complex<float>> *ap) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasChpr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(ap)));
}
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &x, int incx,
const DeviceMemory<std::complex<double>> &y, int incy,
DeviceMemory<std::complex<double>> *ap) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZhpr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, GpuComplex(&cb_alpha),
GpuComplex(GpuMemory(x)), incx,
GpuComplex(GpuMemory(y)), incy,
GpuComplex(GpuMemoryMutable(ap)));
}
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
uint64 k, float alpha, const DeviceMemory<float> &a,
int lda, const DeviceMemory<float> &x, int incx,
float beta, DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSsbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
uint64 k, double alpha, const DeviceMemory<double> &a,
int lda, const DeviceMemory<double> &x, int incx,
double beta, DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDsbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a),
lda, GpuMemory(x), incx, &beta, GpuMemoryMutable(y),
incy);
}
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &ap,
const DeviceMemory<float> &x, int incx, float beta,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSspmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &ap,
const DeviceMemory<double> &x, int incx, double beta,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDspmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *ap) {
return DoBlasInternal(cublasSspr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(ap));
}
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *ap) {
return DoBlasInternal(cublasDspr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(ap));
}
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &x, int incx,
const DeviceMemory<float> &y, int incy,
DeviceMemory<float> *ap) {
return DoBlasInternal(cublasSspr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemory(y), incy, GpuMemoryMutable(ap));
}
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &x, int incx,
const DeviceMemory<double> &y, int incy,
DeviceMemory<double> *ap) {
return DoBlasInternal(cublasDspr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemory(y), incy, GpuMemoryMutable(ap));
}
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &x, int incx, float beta,
DeviceMemory<float> *y, int incy) {
return DoBlasInternal(cublasSsymv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &x, int incx, double beta,
DeviceMemory<double> *y, int incy) {
return DoBlasInternal(cublasDsymv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
}
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &x, int incx,
DeviceMemory<float> *a, int lda) {
return DoBlasInternal(cublasSsyr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &x, int incx,
DeviceMemory<double> *a, int lda) {
return DoBlasInternal(cublasDsyr, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
float alpha, const DeviceMemory<float> &x, int incx,
const DeviceMemory<float> &y, int incy,
DeviceMemory<float> *a, int lda) {
return DoBlasInternal(cublasSsyr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
double alpha, const DeviceMemory<double> &x, int incx,
const DeviceMemory<double> &y, int incy,
DeviceMemory<double> *a, int lda) {
return DoBlasInternal(cublasDsyr2, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
GpuMemory(y), incy, GpuMemoryMutable(a), lda);
}
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *x, int incx) {
return DoBlasInternal(cublasStbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<std::complex<float>> &a,
int lda, DeviceMemory<std::complex<float>> *x,
int incx) {
return DoBlasInternal(cublasCtbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<std::complex<double>> &a,
int lda, DeviceMemory<std::complex<double>> *x,
int incx) {
return DoBlasInternal(cublasZtbmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *x, int incx) {
return DoBlasInternal(cublasStbsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtbsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<std::complex<float>> &a,
int lda, DeviceMemory<std::complex<float>> *x,
int incx) {
return DoBlasInternal(cublasCtbsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
uint64 k, const DeviceMemory<std::complex<double>> &a,
int lda, DeviceMemory<std::complex<double>> *x,
int incx) {
return DoBlasInternal(cublasZtbsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<float> &ap, DeviceMemory<float> *x,
int incx) {
return DoBlasInternal(cublasStpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(ap),
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<double> &ap,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(ap),
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<float>> &ap,
DeviceMemory<std::complex<float>> *x, int incx) {
return DoBlasInternal(cublasCtpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<double>> &ap,
DeviceMemory<std::complex<double>> *x, int incx) {
return DoBlasInternal(cublasZtpmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<float> &ap, DeviceMemory<float> *x,
int incx) {
return DoBlasInternal(cublasStpsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(ap),
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<double> &ap,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtpsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(ap),
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<float>> &ap,
DeviceMemory<std::complex<float>> *x, int incx) {
return DoBlasInternal(cublasCtpsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<double>> &ap,
DeviceMemory<std::complex<double>> *x, int incx) {
return DoBlasInternal(cublasZtpsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *x, int incx) {
return DoBlasInternal(cublasStrmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtrmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<float>> &a, int lda,
DeviceMemory<std::complex<float>> *x, int incx) {
return DoBlasInternal(cublasCtrmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<double>> &a, int lda,
DeviceMemory<std::complex<double>> *x, int incx) {
return DoBlasInternal(cublasZtrmv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *x, int incx) {
return DoBlasInternal(cublasStrsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *x, int incx) {
return DoBlasInternal(cublasDtrsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
GpuMemoryMutable(x), incx);
}
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<float>> &a, int lda,
DeviceMemory<std::complex<float>> *x, int incx) {
return DoBlasInternal(cublasCtrsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, blas::Diagonal diag, uint64 n,
const DeviceMemory<std::complex<double>> &a, int lda,
DeviceMemory<std::complex<double>> *x, int incx) {
return DoBlasInternal(cublasZtrsv, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemoryMutable(x)), incx);
}
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
float alpha, const DeviceMemory<Eigen::half> &a,
int lda, const DeviceMemory<Eigen::half> &b, int ldb,
float beta, DeviceMemory<Eigen::half> *c, int ldc) {
#if CUDA_VERSION >= 7050
VLOG(1) << absl::StrFormat(
"doing cuBLAS SGEMM: at=%d bt=%d m=%u n=%u "
"k=%u alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
"c=%p ldc=%d",
static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
if (transa == blas::Transpose::kNoTranspose) {
if (lda < static_cast<int64>(m)) {
LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
"precondition violation";
}
} else {
if (lda < static_cast<int64>(k)) {
LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
<< ") (transpose case); precondition violation";
}
}
if (transb == blas::Transpose::kNoTranspose) {
if (ldb < static_cast<int64>(k)) {
LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
<< ") (no transpose case); precondition violation";
}
} else {
if (ldb < static_cast<int64>(n)) {
LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
"precondition violation";
}
}
#if CUDA_VERSION < 11000
cublasMath_t math_type = CUBLAS_TENSOR_OP_MATH;
#else
cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
#endif
return DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
#else
LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
<< "(need at least CUDA 7.5)";
return false;
#endif
}
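// The fp16 path above uses cublasSgemmEx: A, B, and C are stored as half
// precision (SE_CUDA_DATA_HALF) while the multiply-accumulate runs in fp32,
// which is why alpha and beta are plain floats. Before CUDA 11, tensor cores
// must be requested explicitly via CUBLAS_TENSOR_OP_MATH; from CUDA 11 on,
// fp16 tensor cores are already enabled under CUBLAS_DEFAULT_MATH.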
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
float alpha, const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &b, int ldb, float beta,
DeviceMemory<float> *c, int ldc) {
VLOG(1) << absl::StrFormat(
"doing cuBLAS SGEMM: at=%d bt=%d m=%u n=%u "
"k=%u alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
"c=%p ldc=%d",
static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
if (transa == blas::Transpose::kNoTranspose) {
if (lda < static_cast<int64>(m)) {
LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
"precondition violation";
}
} else {
if (lda < static_cast<int64>(k)) {
LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
<< ") (transpose case); precondition violation";
}
}
if (transb == blas::Transpose::kNoTranspose) {
if (ldb < static_cast<int64>(k)) {
LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
<< ") (no transpose case); precondition violation";
}
} else {
if (ldb < static_cast<int64>(n)) {
LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
"precondition violation";
}
}
#if CUDA_VERSION < 11000
cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
#else
cublasMath_t math_type = CUBLAS_TF32_TENSOR_OP_MATH;
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 8) {
// TODO(reedwm): Remove or make this VLOG(1) once TensorFloat-32 is more
// well tested.
VLOG(2) << "TensorFloat-32 will be used for the matrix multiplication.";
}
#endif
return DoBlasInternalImpl(
cublasSgemm, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), lda,
GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
}
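// Whether the TF32 request above takes effect is gated twice: in
// DoBlasInternalImpl, the CUBLAS_VER_MAJOR >= 11 branch also consults the
// process-wide TensorFloat-32 toggle, and the hardware must have TF32 tensor
// cores (compute capability 8.0+; the cc check above only affects logging).
// A sketch of flipping the toggle, assuming the tensor_float_32_utils
// helpers already visible to this file:
//
//   tensorflow::enable_tensor_float_32_execution(false);  // force full fp32
//   CHECK(!tensorflow::tensor_float_32_execution_enabled());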
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
double alpha, const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &b, int ldb, double beta,
DeviceMemory<double> *c, int ldc) {
return DoBlasInternal(cublasDgemm, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
n, k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb,
&beta, GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCgemm, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
n, k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemory(b)), ldb,
GpuComplex(&cb_beta), GpuComplex(GpuMemoryMutable(c)),
ldc);
}
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZgemm, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m,
n, k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)),
lda, GpuComplex(GpuMemory(b)), ldb,
GpuComplex(&cb_beta), GpuComplex(GpuMemoryMutable(c)),
ldc);
}
bool CUDABlas::DoBlasGemvWithProfiling(
Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
int incx, float beta, DeviceMemory<float> *y, int incy,
blas::ProfileResult *output_profile_result) {
return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
incx, beta, y, incy,
output_profile_result);
}
bool CUDABlas::DoBlasGemvWithProfiling(
Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
int incx, double beta, DeviceMemory<double> *y, int incy,
blas::ProfileResult *output_profile_result) {
return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
incx, beta, y, incy,
output_profile_result);
}
bool CUDABlas::DoBlasGemvWithProfiling(
Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
int lda, const DeviceMemory<std::complex<float>> &x, int incx,
std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
blas::ProfileResult *output_profile_result) {
return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
incx, beta, y, incy,
output_profile_result);
}
bool CUDABlas::DoBlasGemvWithProfiling(
Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
int lda, const DeviceMemory<std::complex<double>> &x, int incx,
std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
blas::ProfileResult *output_profile_result) {
return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
incx, beta, y, incy,
output_profile_result);
}
bool CUDABlas::DoBlasGemmWithProfiling(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
DeviceMemory<Eigen::half> *c, int ldc,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
lda, b, ldb, beta, c, ldc,
output_profile_result);
}
bool CUDABlas::DoBlasGemmWithProfiling(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
int ldc, blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
lda, b, ldb, beta, c, ldc,
output_profile_result);
}
bool CUDABlas::DoBlasGemmWithProfiling(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &b, int ldb, double beta,
DeviceMemory<double> *c, int ldc,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
lda, b, ldb, beta, c, ldc,
output_profile_result);
}
bool CUDABlas::DoBlasGemmWithProfiling(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
lda, b, ldb, beta, c, ldc,
output_profile_result);
}
bool CUDABlas::DoBlasGemmWithProfiling(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
lda, b, ldb, beta, c, ldc,
output_profile_result);
}
template <typename T>
bool CUDABlas::DoBlasGemvWithProfilingImpl(
Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
const T &beta, DeviceMemory<T> *y, int incy,
blas::ProfileResult *output_profile_result) {
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (output_profile_result != nullptr) {
timer.reset(new GpuTimer(parent_));
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return false;
}
}
// Call DoBlasGemv.
bool result =
DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
if (timer != nullptr && result) {
// GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
// state.
if (!timer->Stop(AsGpuStream(stream))) {
return false;
}
output_profile_result->set_is_valid(true);
output_profile_result->set_algorithm(blas::kDefaultBlasGemv);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
return result;
}
template <typename T, typename ParamType>
bool CUDABlas::DoBlasGemmWithProfilingImpl(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (output_profile_result != nullptr) {
timer.reset(new GpuTimer(parent_));
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return false;
}
}
// Call DoBlasGemm.
bool result = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a, lda, b,
ldb, beta, c, ldc);
if (timer != nullptr && result) {
// GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
// state.
if (!timer->Stop(AsGpuStream(stream))) {
return false;
}
output_profile_result->set_is_valid(true);
output_profile_result->set_algorithm(blas::kDefaultBlasGemm);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
return result;
}
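// Illustrative usage sketch for the two profiling wrappers above (assumed
// caller-side code; `blas`, `stream`, and the device buffers are not defined
// in this file). Passing a ProfileResult turns on the GpuTimer path:
//
//   blas::ProfileResult profile;
//   if (blas->DoBlasGemvWithProfiling(stream, blas::Transpose::kNoTranspose,
//                                     m, n, /*alpha=*/1.0f, a, lda, x,
//                                     /*incx=*/1, /*beta=*/0.0f, &y,
//                                     /*incy=*/1, &profile) &&
//       profile.is_valid()) {
//     VLOG(1) << "gemv took " << profile.elapsed_time_in_ms() << " ms";
//   }
//
// Passing nullptr instead of &profile skips timer creation entirely.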
static bool UsesTensorOps(blas::AlgorithmType algo) {
#if CUDA_VERSION >= 9000
cublasGemmAlgo_t cublas_algo = static_cast<cublasGemmAlgo_t>(algo);
return cublas_algo >= CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#else
return false;
#endif
}
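// UsesTensorOps relies on cuBLAS placing every *_TENSOR_OP enumerator at or
// above CUBLAS_GEMM_DEFAULT_TENSOR_OP. A minimal compile-time check of that
// assumption (CUDA >= 9.0 only, where these enumerators exist):
#if CUDA_VERSION >= 9000
static_assert(CUBLAS_GEMM_ALGO0_TENSOR_OP >= CUBLAS_GEMM_DEFAULT_TENSOR_OP,
"tensor-op algorithm enumerators are expected to follow "
"CUBLAS_GEMM_DEFAULT_TENSOR_OP");
#endif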
template <typename InT, typename OutT, typename CompT>
bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
blas::ComputationType computation_type, blas::AlgorithmType algorithm,
blas::ProfileResult *output_profile_result) {
// GPUs < sm_50 don't support cublasGemmEx.
// Zero-initialize: cc_major is read again below even if the capability
// query fails.
int cc_major = 0, cc_minor = 0;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major < 5) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because sm" << cc_major
<< cc_minor << " devices don't support explicit gemm algorithms.";
return false;
}
bool algo_uses_tensor_ops = UsesTensorOps(algorithm);
cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
if (algo_uses_tensor_ops) {
if (cc_major < 7) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
<< algorithm
<< " uses tensor ops, but tensor ops are not available in sm"
<< cc_major << "X devices.";
return false;
} else if (std::is_same<InT, float>::value) {
#if CUDA_VERSION < 11000
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
<< algorithm
<< " uses tensor ops, but tensor ops are not available for fp32"
<< " inputs.";
return false;
#else
if (cc_major < 8) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
<< algorithm
<< " uses tensor ops, but tensor ops are not available in sm"
<< cc_major << "X devices for float input types.";
return false;
} else if (!tensorflow::tensor_float_32_execution_enabled()) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
<< algorithm
<< " uses tensor ops, but tensor ops are disabled for fp32"
<< " inputs.";
return false;
}
math_type = CUBLAS_TF32_TENSOR_OP_MATH;
#endif
} else if (std::is_same<InT, Eigen::half>::value) {
#if CUDA_VERSION < 11000
math_type = CUBLAS_TENSOR_OP_MATH;
#endif
} else {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because algorithm "
<< algorithm
<< " uses tensor ops, which are not supported for InT.";
return false;
}
}
// Either both 'alpha' and 'beta' need to be pointers to device memory, or
// both need to be host scalars.
if (alpha.is_pointer() != beta.is_pointer()) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because one of `alpha` "
"and `beta` is a pointer, but the other is not.";
return false;
}
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (output_profile_result != nullptr) {
timer.reset(new GpuTimer(parent_));
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
"output_profile_result was given, but we were unable to "
"create a GpuTimer.";
return false;
}
}
// Return false if we might be hitting a cuBLAS bug that produces the wrong
// result. See nvbugs/2156201, b/79126339.
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
if ((algorithm == CUBLAS_GEMM_DEFAULT || algorithm >= CUBLAS_GEMM_ALGO13) &&
std::max({m, n, k}) >= 2097153 && cc_major < 7) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false to work around cudnn "
"<9.2 bug with m, n, or k >= 2097153. See b/79126339.";
return false;
}
#endif
cudaDataType_t cuda_in_type = CUDADataType<InT>::type;
// Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast,
// we do the following compile-time check on the default value:
static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, "");
// If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we
// essentially reinterpret_cast to __half, which is safe because Eigen::half
// inherits from __half.
cublasStatus_t (*gemm)(cublasHandle_t, cublasOperation_t, cublasOperation_t,
int, int, int, const void *, const void *,
cudaDataType, int, const void *, cudaDataType, int,
const void *, void *, cudaDataType, int, cudaDataType,
cublasGemmAlgo_t) = cublasGemmEx;
bool result = DoBlasInternalImpl(
gemm, stream,
/* pointer_mode_host = */ !alpha.is_pointer(), /*err_on_failure=*/false,
math_type, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
CUDAComputationType(computation_type),
static_cast<cublasGemmAlgo_t>(algorithm));
if (timer != nullptr && result) {
// GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
// state.
if (!timer->Stop(AsGpuStream(stream))) {
VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
"GpuTimer.";
return false;
}
output_profile_result->set_is_valid(true);
output_profile_result->set_algorithm(algorithm);
output_profile_result->set_elapsed_time_in_ms(
timer->GetElapsedMilliseconds());
}
return result;
}
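// Illustrative caller-side sketch (not part of this file): `alpha` and
// `beta` must agree on host-vs-device residency to pass the is_pointer()
// check above.
//
//   HostOrDeviceScalar<float> alpha(1.0f);  // host scalar
//   HostOrDeviceScalar<float> beta(0.0f);   // host scalar -- consistent, OK
//   blas->DoBlasGemmWithAlgorithm(stream, ta, tb, m, n, k, alpha, a, lda, b,
//                                 ldb, beta, &c, ldc,
//                                 blas::ComputationType::kF32, algo,
//                                 /*output_profile_result=*/nullptr);
//
// Mixing a device-resident alpha with a host beta makes the call return
// false before cublasGemmEx is ever invoked.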
bool CUDABlas::GetBlasGemmAlgorithms(
std::vector<blas::AlgorithmType> *out_algorithms) {
// cublasGemmAlgo_t (and cublasGemmEx, the function that accepts it) were
// first introduced in CUDA 8.
//
// Note that we populate out_algorithms even when the CUDA version or
// compute capability is insufficient to run them; the caller is responsible
// for detecting that case and treating the returned list as unusable.
*out_algorithms = {
CUBLAS_GEMM_DFALT,
CUBLAS_GEMM_ALGO0,
CUBLAS_GEMM_ALGO1,
CUBLAS_GEMM_ALGO2,
CUBLAS_GEMM_ALGO3,
CUBLAS_GEMM_ALGO4,
CUBLAS_GEMM_ALGO5,
CUBLAS_GEMM_ALGO6,
CUBLAS_GEMM_ALGO7,
#if CUDA_VERSION >= 9000
CUBLAS_GEMM_ALGO8,
CUBLAS_GEMM_ALGO9,
CUBLAS_GEMM_ALGO10,
CUBLAS_GEMM_ALGO11,
CUBLAS_GEMM_ALGO12,
CUBLAS_GEMM_ALGO13,
CUBLAS_GEMM_ALGO14,
CUBLAS_GEMM_ALGO15,
CUBLAS_GEMM_ALGO16,
CUBLAS_GEMM_ALGO17,
CUBLAS_GEMM_DFALT_TENSOR_OP,
CUBLAS_GEMM_ALGO0_TENSOR_OP,
CUBLAS_GEMM_ALGO1_TENSOR_OP,
CUBLAS_GEMM_ALGO2_TENSOR_OP,
CUBLAS_GEMM_ALGO3_TENSOR_OP,
CUBLAS_GEMM_ALGO4_TENSOR_OP,
#endif
#if CUDA_VERSION >= 9020
CUBLAS_GEMM_ALGO18,
CUBLAS_GEMM_ALGO19,
CUBLAS_GEMM_ALGO20,
CUBLAS_GEMM_ALGO21,
CUBLAS_GEMM_ALGO22,
CUBLAS_GEMM_ALGO23,
CUBLAS_GEMM_ALGO5_TENSOR_OP,
CUBLAS_GEMM_ALGO6_TENSOR_OP,
CUBLAS_GEMM_ALGO7_TENSOR_OP,
CUBLAS_GEMM_ALGO8_TENSOR_OP,
CUBLAS_GEMM_ALGO9_TENSOR_OP,
CUBLAS_GEMM_ALGO10_TENSOR_OP,
CUBLAS_GEMM_ALGO11_TENSOR_OP,
CUBLAS_GEMM_ALGO12_TENSOR_OP,
CUBLAS_GEMM_ALGO13_TENSOR_OP,
CUBLAS_GEMM_ALGO14_TENSOR_OP,
CUBLAS_GEMM_ALGO15_TENSOR_OP,
#endif
};
return true;
}
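// Illustrative autotuning sketch (assumed caller-side code): try every
// advertised algorithm and keep the fastest successful one. Algorithms that
// are unsupported for the given types/devices simply fail and are skipped.
//
//   std::vector<blas::AlgorithmType> algorithms;
//   blas->GetBlasGemmAlgorithms(&algorithms);
//   blas::ProfileResult best;
//   for (blas::AlgorithmType algo : algorithms) {
//     blas::ProfileResult profile;
//     if (blas->DoBlasGemmWithAlgorithm(/*...gemm args...*/, algo,
//                                       &profile) &&
//         profile.is_valid() &&
//         (!best.is_valid() ||
//          profile.elapsed_time_in_ms() < best.elapsed_time_in_ms())) {
//       best = profile;
//     }
//   }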
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
const HostOrDeviceScalar<int> &beta, DeviceMemory<int> *c, int ldc,
blas::ComputationType computation_type, blas::AlgorithmType algorithm,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
const DeviceMemory<Eigen::half> &a, int lda,
const DeviceMemory<Eigen::half> &b, int ldb,
const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
int ldc, blas::ComputationType computation_type,
blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
if (computation_type == blas::ComputationType::kF32) {
if (alpha.is_pointer() || beta.is_pointer()) {
// We cannot easily convert a pointer to f16 memory to a pointer to f32
// memory from here, so we don't support this for now.
// TODO(akuegel): Investigate whether we can do the conversion before
// calling DoBlasGemmWithAlgorithm.
return false;
}
HostOrDeviceScalar<float> float_alpha(static_cast<float>(alpha.value()));
HostOrDeviceScalar<float> float_beta(static_cast<float>(beta.value()));
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb,
float_beta, c, ldc, computation_type, algorithm, output_profile_result);
}
CHECK_EQ(computation_type, blas::ComputationType::kF16);
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
int ldc, blas::ComputationType computation_type,
blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
int ldc, blas::ComputationType computation_type,
blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
const HostOrDeviceScalar<std::complex<float>> &beta,
DeviceMemory<std::complex<float>> *c, int ldc,
blas::ComputationType computation_type, blas::AlgorithmType algorithm,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
bool CUDABlas::DoBlasGemmWithAlgorithm(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
const HostOrDeviceScalar<std::complex<double>> &beta,
DeviceMemory<std::complex<double>> *c, int ldc,
blas::ComputationType computation_type, blas::AlgorithmType algorithm,
blas::ProfileResult *output_profile_result) {
return DoBlasGemmWithAlgorithmImpl(
stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
computation_type, algorithm, output_profile_result);
}
template <typename T>
struct HalfAsFloat {
typedef T type;
};
template <>
struct HalfAsFloat<Eigen::half> {
typedef float type;
};
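// HalfAsFloat mirrors the fact that the batched fp16 paths below compute in
// fp32: Eigen::half maps to float and every other type maps to itself.
// Minimal compile-time checks (std::is_same is already used in this file):
static_assert(std::is_same<HalfAsFloat<Eigen::half>::type, float>::value,
"fp16 batched GEMM scalars are promoted to float");
static_assert(std::is_same<HalfAsFloat<float>::type, float>::value,
"non-half types pass through unchanged");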
namespace {
// Pass-through for non-complex types that don't need conversion to a
// cuBLAS-specific type.
template <typename T>
T inline GpuComplexValue(T v) {
return v;
}
} // namespace
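// Overload resolution note for the pass-through above: in
// DoBlasGemmBatchedInternal, `auto cb_alpha = GpuComplexValue(alpha);`
// selects this identity overload for float/double scalars, while
// std::complex<float>/std::complex<double> scalars pick the complex
// conversion overload defined earlier in this file (yielding the
// cuBLAS-native complex types).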
template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
for (int i = 0; i < batch_count; ++i) {
a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
}
typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;
const size_t size = batch_count * sizeof(CUDA_T *);
// Device-side copy of pointers to matrices.
DeviceMemory<CUDA_T *> a;
DeviceMemory<CUDA_T *> b;
DeviceMemory<CUDA_T *> c;
// If temporary space is allocated for device-side copies of pointers to
// matrices, that temporary space should not be freed until this function
// returns. Although the values for these unique_ptrs are not set here, they
// are declared at this scope so they will be destroyed when the function
// returns.
//
// If a scratch allocator is provided, these pointers will not be used at all.
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;
// Decide how to allocate device-side copy of pointers to matrices based on
// whether a scratch allocator was passed.
if (scratch_allocator != nullptr) {
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
scratch_allocator->AllocateBytes(size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
scratch_allocator->AllocateBytes(size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
scratch_allocator->AllocateBytes(size));
a = DeviceMemory<CUDA_T *>(a_bytes);
b = DeviceMemory<CUDA_T *>(b_bytes);
c = DeviceMemory<CUDA_T *>(c_bytes);
} else {
SE_ASSIGN_OR_RETURN(a_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(b_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(c_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
}
if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
return port::Status(port::error::INTERNAL,
"failed to copy memory from host to device in "
"CUDABlas::DoBlasGemmBatched");
}
cudaDataType_t data_type = CUDADataType<T>::type;
#if CUDA_VERSION >= 9010
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
cublasMath_t math_type;
cublasGemmAlgo_t algo;
if (data_type == CUDA_R_16F) {
#if CUDA_VERSION < 11000
math_type = CUBLAS_TENSOR_OP_MATH;
#else
math_type = CUBLAS_DEFAULT_MATH;
#endif
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
#if CUBLAS_VER_MAJOR >= 11
} else if (data_type == CUDA_R_32F) {
// DoBlasInternalImpl will switch math_type back to CUBLAS_DEFAULT_MATH
// if TensorFloat-32 is disabled.
math_type = CUBLAS_TF32_TENSOR_OP_MATH;
algo = tensorflow::tensor_float_32_execution_enabled()
? CUBLAS_GEMM_DFALT_TENSOR_OP
: CUBLAS_GEMM_DFALT;
#endif
} else {
math_type = CUBLAS_DEFAULT_MATH;
algo = CUBLAS_GEMM_DFALT;
}
cudaDataType_t compute_type =
(data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
const void **a_void_ptrs = reinterpret_cast<const void **>(
const_cast<const CUDA_T **>(GpuMemory(a)));
const void **b_void_ptrs = reinterpret_cast<const void **>(
const_cast<const CUDA_T **>(GpuMemory(b)));
void **c_void_ptrs =
reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
bool ok;
cublasStatus_t (*gemm)(cublasHandle_t, cublasOperation_t, cublasOperation_t,
int, int, int, const void *, const void *const[],
cudaDataType, int, const void *const[], cudaDataType,
int, const void *, void *const[], cudaDataType, int,
int, cudaDataType, cublasGemmAlgo_t) =
cublasGemmBatchedEx;
ok = DoBlasInternalImpl(
gemm, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
batch_count, compute_type, algo);
if (ok) {
return port::Status::OK();
}
return port::Status(port::error::INTERNAL,
"failed BLAS call, see log for details");
}
#endif
// Either CUDA_VERSION < 9.1 or SM < 5.0.
if (data_type != CUDA_R_16F) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
bool ok = DoBlasInternal(
cublas_func, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
GpuComplex(&cb_alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
if (ok) {
return port::Status::OK();
}
return port::Status(port::error::INTERNAL,
"failed BLAS call, see log for details");
} else {
// Fall back to a loop for fp16
for (int b = 0; b < batch_count; ++b) {
const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
lda, b_matrix, ldb, beta, c_matrix, ldc);
if (!ok) {
return port::Status(port::error::INTERNAL,
"failed BLAS call, see log for details");
}
}
return port::Status::OK();
}
}
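// Staging layout used above (illustrative): batched cuBLAS entry points
// require *device-resident* arrays of matrix pointers, so the host-side
// vectors are copied into device buffers before the call:
//
//   host:   a_raw_ptrs = {A0, A1, ..., A_{batch_count-1}}   (T* values)
//   device: a          = [A0][A1]...[A_{batch_count-1}]     (size bytes)
//
// with size = batch_count * sizeof(CUDA_T *), via stream->ThenMemcpy. The
// same staging happens for b and c.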
template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
const T **a_array, int lda, const T **b_array, int ldb, Scalar beta,
T **c_array, int ldc, int batch_count) {
cudaDataType_t data_type = CUDADataType<T>::type;
const void *const *a_void_ptrs =
reinterpret_cast<const void *const *>(a_array);
const void *const *b_void_ptrs =
reinterpret_cast<const void *const *>(b_array);
void *const *c_void_ptrs = reinterpret_cast<void *const *>(c_array);
#if CUDA_VERSION >= 9010
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
cublasMath_t math_type;
cublasGemmAlgo_t algo;
if (data_type == CUDA_R_16F) {
#if CUDA_VERSION < 11000
math_type = CUBLAS_TENSOR_OP_MATH;
#else
math_type = CUBLAS_DEFAULT_MATH;
#endif
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
#if CUBLAS_VER_MAJOR >= 11
} else if (data_type == CUDA_R_32F) {
// DoBlasInternalImpl will switch math_type back to CUBLAS_DEFAULT_MATH
// if TensorFloat-32 is disabled.
math_type = CUBLAS_TF32_TENSOR_OP_MATH;
algo = tensorflow::tensor_float_32_execution_enabled()
? CUBLAS_GEMM_DFALT_TENSOR_OP
: CUBLAS_GEMM_DFALT;
#endif
} else {
math_type = CUBLAS_DEFAULT_MATH;
algo = CUBLAS_GEMM_DFALT;
}
cudaDataType_t compute_type =
(data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
bool ok;
cublasStatus_t (*gemm)(cublasHandle_t, cublasOperation_t, cublasOperation_t,
int, int, int, const void *, const void *const[],
cudaDataType, int, const void *const[], cudaDataType,
int, const void *, void *const[], cudaDataType, int,
int, cudaDataType, cublasGemmAlgo_t) =
cublasGemmBatchedEx;
ok = DoBlasInternalImpl(
gemm, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
batch_count, compute_type, algo);
if (ok) {
return port::Status::OK();
}
return port::Status(port::error::INTERNAL,
"failed BLAS call, see log for details");
}
#endif
// Either CUDA_VERSION < 9.1 or SM < 5.0: the raw-pointer batched interface
// has no legacy fallback here, so report the unsupported configuration.
return port::Status(port::error::INTERNAL,
"Batched GEMM with raw pointer arrays requires "
"CUDA >= 9.1 and compute capability >= 5.0.");
}
bool CUDABlas::DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha,
const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
// Note: The func passed here (cublasSgemmBatched) is not actually called,
// due to special handling of fp16 inside DoBlasGemmBatchedInternal.
port::Status status = DoBlasGemmBatchedInternal(
cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha,
const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
port::Status status = DoBlasGemmBatchedInternal(
cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha,
const Eigen::half **a_array, int lda,
const Eigen::half **b_array, int ldb,
float beta, Eigen::half **c_array, int ldc,
int batch_count) {
port::Status status = DoBlasGemmBatchedInternal(
cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const float **a_array,
int lda, const float **b_array, int ldb,
float beta, float **c_array, int ldc,
int batch_count) {
port::Status status = DoBlasGemmBatchedInternal(
cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n,
uint64 k, double alpha, const double **a_array,
int lda, const double **b_array, int ldb,
double beta, double **c_array, int ldc,
int batch_count) {
port::Status status = DoBlasGemmBatchedInternal(
cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, double alpha,
const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
port::Status status = DoBlasGemmBatchedInternal(
cublasDgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<float> alpha,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
int lda,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
int ldb, std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
port::Status status = DoBlasGemmBatchedInternal(
cublasCgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<double> alpha,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
int lda,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
int ldb, std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
port::Status status = DoBlasGemmBatchedInternal(
cublasZgemmBatched, stream, transa, transb, m, n, k, alpha, a_array, lda,
b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDABlas::DoBlasGemmStridedBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
int64 stride_c, int batch_count) {
#if CUDA_VERSION >= 9010
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
cublasGemmAlgo_t algo =
(cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
#if CUDA_VERSION < 11000
cublasMath_t math_type = CUBLAS_TENSOR_OP_MATH;
#else
cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
#endif
cublasStatus_t (*bgemm)(
cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int,
const void *, const void *, cudaDataType, int, long long int,
const void *, cudaDataType, int, long long int, const void *, void *,
cudaDataType, int, long long int, int, cudaDataType,
cublasGemmAlgo_t) = cublasGemmStridedBatchedEx;
bool ok = DoBlasInternalImpl(
bgemm, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), CUDA_R_16F,
lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta,
GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F,
algo);
if (ok) {
return true;
}
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
}
#endif
// Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
for (int batch = 0; batch < batch_count; ++batch) {
const auto *a_matrix =
reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
const auto *b_matrix =
reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
auto *c_matrix =
reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
bool ok = DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, CUBLAS_DEFAULT_MATH,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
a_matrix, SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb,
&beta, c_matrix, SE_CUDA_DATA_HALF, ldc);
if (!ok) {
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
}
}
return true;
}
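// Indexing used by the fallback loop above (illustrative): with strided
// batching, batch i begins i * stride elements past the base pointer
// (element strides, not bytes):
//
//   a_matrix_i = GpuMemory(a) + i * stride_a;          // const Eigen::half *
//   c_matrix_i = GpuMemoryMutable(c) + i * stride_c;   // Eigen::half *
//
// which is then reinterpret_cast to __half for cublasSgemmEx.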
bool CUDABlas::DoBlasGemmStridedBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
int batch_count) {
#if CUDA_VERSION < 11000
cublasMath_t math_type = CUBLAS_DEFAULT_MATH;
#else
cublasMath_t math_type = CUBLAS_TF32_TENSOR_OP_MATH;
#endif
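// Requesting CUBLAS_TF32_TENSOR_OP_MATH unconditionally here is safe:
// DoBlasInternalImpl only applies it when
// tensorflow::tensor_float_32_execution_enabled() is true, and otherwise
// leaves the handle in CUBLAS_DEFAULT_MATH.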
return DoBlasInternalImpl(
cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, math_type, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), lda, stride_a,
GpuMemory(b), ldb, stride_b, &beta, GpuMemoryMutable(c), ldc, stride_c,
batch_count);
}
bool CUDABlas::DoBlasGemmStridedBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
int batch_count) {
return DoBlasInternal(
cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
GpuMemoryMutable(c), ldc, stride_c, batch_count);
}
bool CUDABlas::DoBlasGemmStridedBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
int64 stride_c, int batch_count) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(
cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}
bool CUDABlas::DoBlasGemmStridedBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
int64 stride_c, int batch_count) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(
cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
}
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasChemm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZhemm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
float alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
float beta, DeviceMemory<std::complex<float>> *c,
int ldc) {
return DoBlasInternal(cublasCherk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuComplex(GpuMemory(a)), lda, &beta,
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
double alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
double beta, DeviceMemory<std::complex<double>> *c,
int ldc) {
return DoBlasInternal(cublasZherk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuComplex(GpuMemory(a)), lda, &beta,
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
float beta, DeviceMemory<std::complex<float>> *c,
int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCher2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, &beta,
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
double beta, DeviceMemory<std::complex<double>> *c,
int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZher2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, &beta,
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
float alpha, const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &b, int ldb, float beta,
DeviceMemory<float> *c, int ldc) {
return DoBlasInternal(cublasSsymm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
&alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
double alpha, const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &b, int ldb, double beta,
DeviceMemory<double> *c, int ldc) {
return DoBlasInternal(cublasDsymm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
&alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCsymm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
blas::UpperLower uplo, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZsymm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
float alpha, const DeviceMemory<float> &a, int lda,
float beta, DeviceMemory<float> *c, int ldc) {
return DoBlasInternal(cublasSsyrk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuMemory(a), lda, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
double alpha, const DeviceMemory<double> &a, int lda,
double beta, DeviceMemory<double> *c, int ldc) {
return DoBlasInternal(cublasDsyrk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuMemory(a), lda, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCsyrk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(&cb_beta), GpuComplex(GpuMemoryMutable(c)),
ldc);
}
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZsyrk, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(&cb_beta), GpuComplex(GpuMemoryMutable(c)),
ldc);
}
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
float alpha, const DeviceMemory<float> &a, int lda,
const DeviceMemory<float> &b, int ldb, float beta,
DeviceMemory<float> *c, int ldc) {
return DoBlasInternal(cublasSsyr2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
double alpha, const DeviceMemory<double> &a, int lda,
const DeviceMemory<double> &b, int ldb, double beta,
DeviceMemory<double> *c, int ldc) {
return DoBlasInternal(cublasDsyr2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, &alpha, GpuMemory(a), lda, GpuMemory(b), ldb, &beta,
GpuMemoryMutable(c), ldc);
}
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
const DeviceMemory<std::complex<float>> &b, int ldb,
std::complex<float> beta,
DeviceMemory<std::complex<float>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasCsyr2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
blas::Transpose trans, uint64 n, uint64 k,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
const DeviceMemory<std::complex<double>> &b, int ldb,
std::complex<double> beta,
DeviceMemory<std::complex<double>> *c, int ldc) {
auto cb_alpha = GpuComplexValue(alpha);
auto cb_beta = GpuComplexValue(beta);
return DoBlasInternal(cublasZsyr2k, stream, true /* = pointer_mode_host */,
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
k, GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemory(b)), ldb, GpuComplex(&cb_beta),
GpuComplex(GpuMemoryMutable(c)), ldc);
}
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n, float alpha,
const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *b, int ldb) {
return DoBlasInternal(cublasStrmm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
&alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
GpuMemoryMutable(b), ldb);
}
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n, double alpha,
const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *b, int ldb) {
return DoBlasInternal(cublasDtrmm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
&alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb,
GpuMemoryMutable(b), ldb);
}
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
DeviceMemory<std::complex<float>> *b, int ldb) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCtrmm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemoryMutable(b)), ldb,
GpuComplex(GpuMemoryMutable(b)), ldb);
}
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
DeviceMemory<std::complex<double>> *b, int ldb) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZtrmm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemoryMutable(b)), ldb,
GpuComplex(GpuMemoryMutable(b)), ldb);
}
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n, float alpha,
const DeviceMemory<float> &a, int lda,
DeviceMemory<float> *b, int ldb) {
return DoBlasInternal(cublasStrsm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
&alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n, double alpha,
const DeviceMemory<double> &a, int lda,
DeviceMemory<double> *b, int ldb) {
return DoBlasInternal(cublasDtrsm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
&alpha, GpuMemory(a), lda, GpuMemoryMutable(b), ldb);
}
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n,
std::complex<float> alpha,
const DeviceMemory<std::complex<float>> &a, int lda,
DeviceMemory<std::complex<float>> *b, int ldb) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasCtrsm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemoryMutable(b)), ldb);
}
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
blas::UpperLower uplo, blas::Transpose transa,
blas::Diagonal diag, uint64 m, uint64 n,
std::complex<double> alpha,
const DeviceMemory<std::complex<double>> &a, int lda,
DeviceMemory<std::complex<double>> *b, int ldb) {
auto cb_alpha = GpuComplexValue(alpha);
return DoBlasInternal(cublasZtrsm, stream, true /* = pointer_mode_host */,
CUDABlasSide(side), CUDABlasUpperLower(uplo),
CUDABlasTranspose(transa), CUDABlasDiagonal(diag), m, n,
GpuComplex(&cb_alpha), GpuComplex(GpuMemory(a)), lda,
GpuComplex(GpuMemoryMutable(b)), ldb);
}
port::Status CUDABlas::GetVersion(string *version) {
absl::MutexLock lock(&mu_);
int v;
auto status = cublasGetVersion(blas_, &v);
if (status != CUBLAS_STATUS_SUCCESS) {
return port::InternalError(ToString(status));
}
*version = std::to_string(v);
return port::Status::OK();
}
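// Illustrative usage sketch (assumed caller-side code): the version is
// reported as cuBLAS's single-integer encoding, stringified.
//
//   string version;
//   if (blas->GetVersion(&version).ok()) {
//     VLOG(1) << "cuBLAS version: " << version;
//   }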
} // namespace gpu