in tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc [312:448]
void ComputeSmall(
const std::vector<int>& num_qubits, const int max_num_qubits,
const std::vector<SymbolMap>& maps,
const std::vector<QsimCircuit>& qsim_circuits,
const std::vector<QsimFusedCircuit>& fused_circuits,
const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
partial_fused_circuits,
const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
const std::vector<std::vector<QsimFusedCircuit>>& other_fused_circuits,
const std::vector<std::vector<float>>& downstream_grads,
tensorflow::OpKernelContext* context,
tensorflow::TTypes<std::complex<float>>::Matrix* output_tensor) {
const auto tfq_for = qsim::SequentialFor(1);
using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
using StateSpace = Simulator::StateSpace;
const int output_dim_internal_size = other_fused_circuits[0].size();
auto DoWork = [&](int start, int end) {
int old_batch_index = -2;
int cur_batch_index = -1;
int largest_nq = 1;
int cur_internal_index;
Simulator sim = Simulator(tfq_for);
StateSpace ss = StateSpace(tfq_for);
auto sv = ss.Create(largest_nq);
auto sv_adj = ss.Create(largest_nq);
auto scratch = ss.Create(largest_nq);
auto scratch2 = ss.Create(largest_nq);
for (int i = start; i < end; i++) {
cur_batch_index = i / output_dim_internal_size;
cur_internal_index = i % output_dim_internal_size;
const int nq = num_qubits[cur_batch_index];
if (cur_batch_index != old_batch_index) {
// We've run into a new state vector we must compute.
// Only compute a new state vector when we have to.
if (nq > largest_nq) {
largest_nq = nq;
sv = ss.Create(largest_nq);
sv_adj = ss.Create(largest_nq);
scratch = ss.Create(largest_nq);
scratch2 = ss.Create(largest_nq);
}
ss.SetStateZero(sv);
for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
j < fused_circuits[cur_batch_index].size(); j++) {
qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
}
}
ss.SetStateZero(scratch);
for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
k <
other_fused_circuits[cur_batch_index][cur_internal_index].size();
k++) {
qsim::ApplyFusedGate(
sim, other_fused_circuits[cur_batch_index][cur_internal_index][k],
scratch);
}
// now sv is |psi>, scratch is |phi>
// Start adjoint differentiation.
ss.Copy(sv, sv_adj);
for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0;
l--) {
for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1;
k >= 0; k--) {
ApplyFusedGateDagger(
sim, partial_fused_circuits[cur_batch_index][l][k], sv_adj);
ApplyFusedGateDagger(
sim, partial_fused_circuits[cur_batch_index][l][k], scratch);
}
if (l == 0) {
// last layer will have no parametrized gates so can break.
break;
}
// Hit a parameterized gate.
// todo fix this copy.
auto cur_gate =
qsim_circuits[cur_batch_index]
.gates[gradient_gates[cur_batch_index][l - 1].index];
ApplyGateDagger(sim, cur_gate, sv_adj);
// if applicable compute control qubit mask and control value bits.
uint64_t mask = 0;
uint64_t cbits = 0;
for (int k = 0; k < cur_gate.controlled_by.size(); k++) {
uint64_t control_loc = cur_gate.controlled_by[k];
mask |= uint64_t{1} << control_loc;
cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
}
for (int k = 0;
k < gradient_gates[cur_batch_index][l - 1].grad_gates.size();
k++) {
// Copy sv_adj onto scratch2 in anticipation of non-unitary
// "gradient gate".
ss.Copy(sv_adj, scratch2);
if (!cur_gate.controlled_by.empty()) {
// Gradient of controlled gates puts zeros on diagonal which is
// the same as collapsing the state and then applying the
// non-controlled version of the gradient gate.
ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
}
qsim::ApplyGate(
sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k],
scratch2);
// don't need not-found check since this is done upstream already.
const auto it = maps[cur_batch_index].find(
gradient_gates[cur_batch_index][l - 1].params[k]);
const int loc = it->second.first;
// Apply finite differencing for adjoint gradients.
// Finite differencing enables applying multiple `gradient_gate`
// of a symbol at the same circuit. For analytic methods like
// parameter-shift we need to apply a single `gradient_gate`
// per a symbol.
std::complex<double> result = ss.InnerProduct(scratch2, scratch);
(*output_tensor)(cur_batch_index, loc) +=
(downstream_grads[cur_batch_index][cur_internal_index] *
std::complex<float>(static_cast<float>(result.real()),
static_cast<float>(result.imag())));
}
ApplyGateDagger(sim, cur_gate, scratch);
}
old_batch_index = cur_batch_index;
}
};
const int64_t num_cycles =
200 * (int64_t(1) << static_cast<int64_t>(max_num_qubits));
context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
fused_circuits.size() * output_dim_internal_size, num_cycles, DoWork);
}