query/utils.cu (206 lines of code) (raw):
// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "query/utils.hpp"
#include <cuda_runtime.h>
#include <cstring>
#include <iostream>
#include <string>
// Maximum size of a formatted CUDA error message, including the
// "ERROR: <context>: <cuda error string>" framing and the terminating NUL.
// 256 leaves headroom for long driver/runtime messages that the previous
// 80-byte buffer would silently truncate via snprintf.
const int MAX_CUDA_ERROR_LEN = 256;
// Cumulative day counts for a non-leap year: entry m holds the number of
// days that precede month m (1-based; entry 0 is unused padding at value 0,
// entry 12 is the full 365-day year). Leap-day adjustment, if any, must be
// applied by the consumer — this table itself is leap-year-agnostic.
uint16_t DAYS_BEFORE_MONTH_HOST[13] = {
    0,
    31,
    31 + 28,
    31 + 28 + 31,
    31 + 28 + 31 + 30,
    31 + 28 + 31 + 30 + 31,
    31 + 28 + 31 + 30 + 31 + 30,
    31 + 28 + 31 + 30 + 31 + 30 + 31,
    31 + 28 + 31 + 30 + 31 + 30 + 31 + 31,
    31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30,
    31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31,
    31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30,
    31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30 + 31,
};
// Device-side (constant-memory) copy of the table above, available to
// kernels. It is populated per device by BootstrapDevice via
// cudaMemcpyToSymbol; reading it before bootstrap yields zeros.
__constant__ uint16_t
    DAYS_BEFORE_MONTH_DEVICE[13];
// CheckCUDAError reports (and clears) the most recent CUDA runtime error.
// `message` names the call being checked and is embedded in the output.
// In device builds the error is raised as AlgorithmError; in host builds we
// deliberately only print it, because benign errors such as "insufficient
// driver version" are always reported there.
void CheckCUDAError(const char *message) {
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    return;
  }
  char buf[MAX_CUDA_ERROR_LEN];
  snprintf(buf, sizeof(buf), "ERROR: %s: %s", message,
           cudaGetErrorString(err));
#ifdef RUN_ON_DEVICE
  throw AlgorithmError(buf);
#else
  printf("%s\n", buf);
#endif
}
// BootstrapDevice initializes every visible CUDA device. It should be only
// called by unit tests and golang code.
// Returns a CGoCallResHandle whose pStrErr is nullptr on success, or a
// strdup'ed error message (owned and freed by the Go caller) on failure.
CGoCallResHandle BootstrapDevice() {
  CGoCallResHandle resHandle = {nullptr, nullptr};
  try {
#ifdef RUN_ON_DEVICE
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    CheckCUDAError("cudaGetDeviceCount");
    // Upload the host month-lookup table into constant memory on each
    // device, since __constant__ symbols are per-device.
    for (int device = 0; device < deviceCount; device++) {
      cudaSetDevice(device);
      CheckCUDAError("cudaSetDevice");
      cudaMemcpyToSymbol(DAYS_BEFORE_MONTH_DEVICE, DAYS_BEFORE_MONTH_HOST,
                         sizeof(DAYS_BEFORE_MONTH_HOST));
      CheckCUDAError("cudaMemcpyToSymbol");
    }
#endif
  }
  catch (const std::exception &e) {
    std::cerr << "Exception happened when bootstrapping device:" << e.what()
              << std::endl;
    // Copy the message to the heap so it survives this catch scope; the
    // CGo caller is responsible for freeing it.
    resHandle.pStrErr = strdup(e.what());
  }
  return resHandle;
}
// Returns the stored error message as a C string; the pointer remains valid
// for the lifetime of this AlgorithmError instance.
const char *AlgorithmError::what() const throw() {
  return message_.c_str();
}
// Constructs an AlgorithmError carrying `message`. The text is copied into
// the exception so it outlives the caller's buffer. Uses the member
// initializer list (idiomatic; avoids default-construct-then-assign).
AlgorithmError::AlgorithmError(const std::string &message)
    : message_(message) {
}
namespace ares {
// Rotates the 64-bit value x left by r bits (r expected in [0, 63]).
// Masking the complementary shift with & 63 keeps the expression defined
// when r == 0 (the original `x >> 64` would be undefined behavior); for
// r in 1..63 the result is unchanged.
__host__ __device__
uint64_t rotl64(uint64_t x, int8_t r) {
  return (x << r) | (x >> ((64 - r) & 63));
}
// Applies the MurmurHash3 64-bit finalization mix to k: alternating
// xor-shift and multiply steps that avalanche the bits so every input bit
// affects every output bit.
__host__ __device__
uint64_t fmix64(uint64_t k) {
  k = (k ^ (k >> 33)) * 0xff51afd7ed558ccdLLU;
  k = (k ^ (k >> 33)) * 0xc4ceb9fe1a85ec53LLU;
  return k ^ (k >> 33);
}
// Murmur3Sum32 implements Murmur3Sum32 hash algorithm.
// Computes the 32-bit MurmurHash3 (x86_32 variant) of the `bytes`-long
// buffer at `key`, seeded with `seed`.
// NOTE(review): the block loop reads `key` through a uint32_t pointer,
// which assumes little-endian layout and tolerates unaligned access —
// presumably matched with a CPU-side implementation; confirm on new targets.
__host__ __device__
uint32_t murmur3sum32(const uint8_t *key, int bytes,
                      uint32_t seed) {
  uint32_t h1 = seed;
  int nBlocks = bytes / 4;
  const uint8_t *p = key;
  const uint8_t *p1 = p + 4 * nBlocks;
  // Body: mix the input four bytes at a time.
  for (; p < p1; p += 4) {
    uint32_t k1 = *reinterpret_cast<const uint32_t *>(p);
    k1 *= 0xcc9e2d51;
    k1 = (k1 << 15) | (k1 >> 17);  // rotl32(k1, 15)
    k1 *= 0x1b873593;
    h1 ^= k1;
    h1 = (h1 << 13) | (h1 >> 19);  // rotl32(h1, 13)
    h1 = h1 * 5 + 0xe6546b64;
  }
  // Tail: fold in the remaining 0-3 bytes. The case labels fall through
  // intentionally so each trailing byte is accumulated into k1 before the
  // final mix under case 1.
  int tailBytes = bytes - nBlocks * 4;
  const uint8_t *tail = p1;
  uint32_t k1 = 0;
  switch (tailBytes & 3) {
    case 3:k1 ^= (uint32_t) tail[2] << 16;  // fallthrough
    case 2:k1 ^= (uint32_t) tail[1] << 8;   // fallthrough
    case 1:k1 ^= (uint32_t) tail[0];
      k1 *= 0xcc9e2d51;
      k1 = (k1 << 15) | (k1 >> 17);
      k1 *= 0x1b873593;
      h1 ^= k1;
      break;
  }
  // Finalization: mix in the length, then avalanche.
  h1 ^= bytes;
  h1 ^= h1 >> 16;
  h1 *= 0x85ebca6b;
  h1 ^= h1 >> 13;
  h1 *= 0xc2b2ae35;
  h1 ^= h1 >> 16;
  return h1;
}
// Computes the 128-bit MurmurHash3 (x64_128 variant) of the `len`-byte
// buffer at `key`, seeded with `seed`; the two 64-bit halves of the digest
// are written to out[0] and out[1].
// NOTE(review): the block loop reads `key` through a uint64_t pointer,
// assuming little-endian layout and tolerating unaligned access —
// presumably matched with a CPU-side implementation; confirm on new targets.
__host__ __device__
void murmur3sum128(const uint8_t *key, int len,
                   uint32_t seed, uint64_t *out) {
  const uint8_t *data = key;
  const int nblocks = len / 16;
  int i;
  // Two running hash lanes, both seeded identically.
  uint64_t h1 = seed;
  uint64_t h2 = seed;
  uint64_t c1 = 0x87c37b91114253d5LLU;
  uint64_t c2 = 0x4cf5ad432745937fLLU;
  // Body: mix the input 16 bytes (two 64-bit words) at a time.
  const uint64_t *blocks = reinterpret_cast<const uint64_t *>(data);
  for (i = 0; i < nblocks; i++) {
    uint64_t k1 = blocks[i * 2];
    uint64_t k2 = blocks[i * 2 + 1];
    k1 *= c1;
    k1 = rotl64(k1, 31);
    k1 *= c2;
    h1 ^= k1;
    h1 = rotl64(h1, 27);
    h1 += h2;
    h1 = h1 * 5 + 0x52dce729;
    k2 *= c2;
    k2 = rotl64(k2, 33);
    k2 *= c1;
    h2 ^= k2;
    h2 = rotl64(h2, 31);
    h2 += h1;
    h2 = h2 * 5 + 0x38495ab5;
  }
  // Tail: fold in the remaining 0-15 bytes. The case labels fall through
  // intentionally so each byte lands in the right position of k1/k2.
  const uint8_t *tail = reinterpret_cast<const uint8_t *>(data + nblocks * 16);
  uint64_t k1 = 0;
  uint64_t k2 = 0;
  switch (len & 15) {
    case 15:k2 ^= (uint64_t)(tail[14]) << 48;  // fallthrough
    case 14:k2 ^= (uint64_t)(tail[13]) << 40;  // fallthrough
    case 13:k2 ^= (uint64_t)(tail[12]) << 32;  // fallthrough
    case 12:k2 ^= (uint64_t)(tail[11]) << 24;  // fallthrough
    case 11:k2 ^= (uint64_t)(tail[10]) << 16;  // fallthrough
    case 10:k2 ^= (uint64_t)(tail[9]) << 8;    // fallthrough
    case 9:k2 ^= (uint64_t)(tail[8]) << 0;
      k2 *= c2;
      k2 = rotl64(k2, 33);
      k2 *= c1;
      h2 ^= k2;
      // fallthrough into the k1 bytes
    case 8:k1 ^= (uint64_t)(tail[7]) << 56;    // fallthrough
    case 7:k1 ^= (uint64_t)(tail[6]) << 48;    // fallthrough
    case 6:k1 ^= (uint64_t)(tail[5]) << 40;    // fallthrough
    case 5:k1 ^= (uint64_t)(tail[4]) << 32;    // fallthrough
    case 4:k1 ^= (uint64_t)(tail[3]) << 24;    // fallthrough
    case 3:k1 ^= (uint64_t)(tail[2]) << 16;    // fallthrough
    case 2:k1 ^= (uint64_t)(tail[1]) << 8;     // fallthrough
    case 1:k1 ^= (uint64_t)(tail[0]) << 0;
      k1 *= c1;
      k1 = rotl64(k1, 31);
      k1 *= c2;
      h1 ^= k1;
  }
  // Finalization: mix in the length, cross-feed the lanes, and avalanche
  // each half with fmix64.
  h1 ^= len;
  h2 ^= len;
  h1 += h2;
  h2 += h1;
  h1 = fmix64(h1);
  h2 = fmix64(h2);
  h1 += h2;
  h2 += h1;
  out[0] = h1;
  out[1] = h2;
}
} // namespace ares