in tensorflow_quantum/core/ops/noise/tfq_noisy_samples.cc [205:318]
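// Simulates a batch of noisy circuits with qsim's quantum trajectory
// simulator, spreading the requested samples over the TensorFlow CPU worker
// threads and writing each sampled bitstring into output_tensor. Columns for
// qubits beyond a given circuit's size are padded with -2.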
void ComputeSmall(const std::vector<int>& num_qubits,
const int max_num_qubits, const int num_samples,
const std::vector<NoisyQsimCircuit>& ncircuits,
tensorflow::OpKernelContext* context,
tensorflow::TTypes<int8_t, 3>::Tensor* output_tensor) {
using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
using StateSpace = Simulator::StateSpace;
using QTSimulator =
qsim::QuantumTrajectorySimulator<qsim::IO, QsimGate,
qsim::MultiQubitGateFuser, Simulator>;
const int output_dim_batch_size = output_tensor->dimension(0);
const int num_threads = context->device()
->tensorflow_cpu_worker_threads()
->workers->NumThreads();
// [num_threads, batch_size].
std::vector<std::vector<int>> rep_offsets(
num_threads, std::vector<int>(output_dim_batch_size, 0));
BalanceTrajectory(num_samples, num_threads, &rep_offsets);
  // [num_threads, batch_size]: offset_prefix_sum[t][i] holds the total number
  // of samples of circuit i written by threads 0 through t.
std::vector<std::vector<long>> offset_prefix_sum(
num_threads, std::vector<long>(output_dim_batch_size, 0));
for (int i = 0; i < output_dim_batch_size; i++) {
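    // Baseline number of samples per thread: ceil(num_samples / num_threads).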
int p_reps = (num_samples + num_threads - 1) / num_threads;
offset_prefix_sum[0][i] = rep_offsets[0][i] + p_reps;
for (int j = 1; j < num_threads; j++) {
offset_prefix_sum[j][i] += offset_prefix_sum[j - 1][i];
offset_prefix_sum[j][i] += rep_offsets[j][i] + p_reps;
}
}
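  // Seed a guarded Philox generator; each worker thread reserves its own
  // block of random samples inside DoWork below.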
tensorflow::GuardedPhiloxRandom random_gen;
random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());
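  // With the fixed block size of 1 used below, each DoWork call handles a
  // single thread index `start` and runs that thread's share of trajectories
  // for every circuit in the batch.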
auto DoWork = [&](int start, int end) {
// Begin simulation.
const auto tfq_for = qsim::SequentialFor(1);
int largest_nq = 1;
Simulator sim = Simulator(tfq_for);
StateSpace ss = StateSpace(tfq_for);
auto sv = ss.Create(largest_nq);
auto scratch = ss.Create(largest_nq);
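    // Reserve roughly enough 32-bit random values for this thread's share of
    // trajectories across all circuits, with a little slack.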
int needed_random =
4 * (num_samples * ncircuits.size() + num_threads) / num_threads;
needed_random += 4;
auto local_gen = random_gen.ReserveSamples32(needed_random);
tensorflow::random::SimplePhilox rand_source(&local_gen);
    for (int i = 0; i < static_cast<int>(ncircuits.size()); i++) {
int nq = num_qubits[i];
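      // Starting output row and number of samples this thread must produce
      // for circuit i, derived from the prefix sums computed above.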
int j = start > 0 ? offset_prefix_sum[start - 1][i] : 0;
int needed_samples = offset_prefix_sum[start][i] - j;
if (needed_samples <= 0) {
continue;
}
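      // Grow the state vector and scratch space only when a larger circuit
      // is encountered.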
if (nq > largest_nq) {
largest_nq = nq;
sv = ss.Create(largest_nq);
scratch = ss.Create(largest_nq);
}
QTSimulator::Parameter param;
param.collect_kop_stat = false;
param.collect_mea_stat = true;
param.normalize_before_mea_gates = true;
      // Measurement results gathered during each trajectory run.
      std::vector<uint64_t> gathered_samples;
int run_samples = 0;
      while (true) {
ss.SetStateZero(sv);
QTSimulator::RunOnce(param, ncircuits[i], rand_source.Rand64(), ss,
sim, scratch, sv, gathered_samples);
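        // gathered_samples[0] holds the bitstring from the trajectory's
        // terminal measurement; unpack it one qubit at a time, writing the
        // lowest qubit into the last column of the output row.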
uint64_t q_ind = 0;
uint64_t mask = 1;
        bool val = false;
        while (q_ind < static_cast<uint64_t>(nq)) {
val = gathered_samples[0] & mask;
(*output_tensor)(
i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = val;
q_ind++;
mask <<= 1;
}
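        // Circuits with fewer than max_num_qubits qubits: pad the remaining
        // columns with -2.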
        while (q_ind < static_cast<uint64_t>(max_num_qubits)) {
(*output_tensor)(
i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = -2;
q_ind++;
}
j++;
run_samples++;
// Check if we have gathered enough samples.
if (run_samples >= needed_samples) {
break;
}
}
}
};
  // Dispatch with a fixed block size of 1 so that each DoWork invocation
  // corresponds to exactly one thread index.
tensorflow::thread::ThreadPool::SchedulingParams scheduling_params(
tensorflow::thread::ThreadPool::SchedulingStrategy::kFixedBlockSize,
absl::nullopt, 1);
context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
num_threads, scheduling_params, DoWork);
}