in src/core/tensor/tensor.cc [646:718]
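// Repeats chunks of src's data into dst. When repeats has a single entry (or
// broadcast_flag is set), every slice along `axis` is duplicated repeats[0]
// times; with axis == Noaxis the tensor is treated as flat and each element
// is repeated, as in numpy.repeat with a scalar count. When repeats is a
// sequence, `axis` is required and slice j along it is duplicated repeats[j]
// times. dst must be pre-allocated with the repeated shape.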
void RepeatDataToFrom(bool broadcast_flag, const vector<size_t> &repeats,
                      int axis, Tensor *dst, const Tensor &src,
                      const size_t num) {
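  // A single repeat count means broadcast; a sequence of counts requires an
  // explicit axis so each count can be matched to a slice.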
  if (repeats.size() == 1) {
    broadcast_flag = true;
  } else if (repeats.size() > 1) {
    if (axis == Noaxis) {
      LOG(FATAL) << "When the repeats parameter is a sequence, axis cannot "
                    "be None";
    }
  }
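  // repeats holds size_t values, so this CHECK can never fire; it documents
  // the requirement that every repeat count be non-negative.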
  for (size_t i = 0; i < repeats.size(); i++) {
    CHECK_GE(repeats[i], 0);
  }
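  // Source and destination must share the element width, since the repeat is
  // a raw byte copy performed chunk by chunk.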
  auto width = SizeOf(src.data_type());
  CHECK_EQ(width, SizeOf(dst->data_type()));
  // size_t nBytes = num * width;
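  // Geometry of the copy: `chunk` is the byte size of one contiguous slice
  // below `axis`, `axis_shape` is the extent of the repeat axis, and
  // `shape_outer` is the number of slices above `axis`. With Noaxis the
  // tensor is treated as flat: one element per chunk.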
  int chunk = width;
  int axis_shape = 1;
  int shape_outer = 1;
  if (axis == Noaxis) {
    axis_shape = 1;
    shape_outer = Product(src.shape());
  } else {
    for (int i = 0; i < axis; i++) {
      shape_outer *= src.shape()[i];
    }
    axis_shape = src.shape()[axis];
    for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) {
      chunk *= src.shape()[i];
    }
  }
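  // Resolve which device executes the copy and in which direction. When the
  // source and destination use different backends, the non-CPP device drives
  // the transfer; CUDA <-> OpenCL copies are rejected below.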
  Device *dev = nullptr;
  CopyDirection direct;
  std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
  if (dst_dev->lang() != src_dev->lang()) {
    // Let the non-CPP device conduct the copy op.
    if (dst_dev->lang() == kCpp) {
      dev = src_dev.get();
      direct = kDeviceToHost;
    } else if (src_dev->lang() == kCpp) {
      dev = dst_dev.get();
      direct = kHostToDevice;
    } else {
      LOG(FATAL) << "Memory repeat copy between CUDA and OpenCL devices is "
                    "not supported";
    }
  } else {
    dev = src_dev.get();
    direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
  }
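  // Emit one async copy per repeated chunk: slice j along `axis` in every
  // outer slice is written repeats[j] times (repeats[0] when broadcasting).
  // Example: src shape (2, 3), axis = 1, repeats = {2, 1, 3} copies column j
  // of each row repeats[j] times, filling a dst of shape (2, 6).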
  int dst_offset = 0;
  int src_offset = 0;
  Tensor &dstRef = *dst;
  for (int i = 0; i < shape_outer; i++) {
    for (int j = 0; j < axis_shape; j++) {
      int repeat_count = broadcast_flag ? repeats[0] : repeats[j];
      for (int k = 0; k < repeat_count; k++) {
        dev->Exec(
            [dev, dstRef, src, chunk, direct, dst_offset,
             src_offset](Context *ctx) mutable {
              Block *from = src.block(), *to = dstRef.block();
              dev->CopyDataToFrom(to, from, chunk, direct, dst_offset,
                                  src_offset, ctx);
            },
            {src.block()}, {dst->block()}, "CopyDataToFrom");
        dst_offset += chunk;
      }
      src_offset += chunk;
    }
  }
}
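
// Illustrative caller, a minimal sketch only (not part of this file): it
// assumes the public Tensor(Shape, device) constructor and Size() accessor,
// and that the caller sizes dst so the repeated data fits.
//
//   Tensor src(Shape{2, 3});                 // e.g. floats on default device
//   Tensor dst(Shape{2, 6}, src.device());   // axis-1 extent = sum(repeats)
//   vector<size_t> repeats = {2, 1, 3};      // per-slice counts along axis 1
//   RepeatDataToFrom(false, repeats, 1, &dst, src, src.Size());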