void ComputeSmall()

in tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc [312:448]


  void ComputeSmall(
      const std::vector<int>& num_qubits, const int max_num_qubits,
      const std::vector<SymbolMap>& maps,
      const std::vector<QsimCircuit>& qsim_circuits,
      const std::vector<QsimFusedCircuit>& fused_circuits,
      const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
          partial_fused_circuits,
      const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
      const std::vector<std::vector<QsimFusedCircuit>>& other_fused_circuits,
      const std::vector<std::vector<float>>& downstream_grads,
      tensorflow::OpKernelContext* context,
      tensorflow::TTypes<std::complex<float>>::Matrix* output_tensor) {
    const auto tfq_for = qsim::SequentialFor(1);
    using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
    using StateSpace = Simulator::StateSpace;

    const int output_dim_internal_size = other_fused_circuits[0].size();

    auto DoWork = [&](int start, int end) {
      int old_batch_index = -2;
      int cur_batch_index = -1;
      int largest_nq = 1;
      int cur_internal_index;

      Simulator sim = Simulator(tfq_for);
      StateSpace ss = StateSpace(tfq_for);
      auto sv = ss.Create(largest_nq);
      auto sv_adj = ss.Create(largest_nq);
      auto scratch = ss.Create(largest_nq);
      auto scratch2 = ss.Create(largest_nq);
      for (int i = start; i < end; i++) {
        cur_batch_index = i / output_dim_internal_size;
        cur_internal_index = i % output_dim_internal_size;

        const int nq = num_qubits[cur_batch_index];

        if (cur_batch_index != old_batch_index) {
          // We've run into a new state vector we must compute.
          // Only compute a new state vector when we have to.
          if (nq > largest_nq) {
            largest_nq = nq;
            sv = ss.Create(largest_nq);
            sv_adj = ss.Create(largest_nq);
            scratch = ss.Create(largest_nq);
            scratch2 = ss.Create(largest_nq);
          }
          ss.SetStateZero(sv);
          for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
               j < fused_circuits[cur_batch_index].size(); j++) {
            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
          }
        }

        ss.SetStateZero(scratch);
        for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
             k <
             other_fused_circuits[cur_batch_index][cur_internal_index].size();
             k++) {
          qsim::ApplyFusedGate(
              sim, other_fused_circuits[cur_batch_index][cur_internal_index][k],
              scratch);
        }
        // now sv is |psi>, scratch is |phi>
        // Start adjoint differentiation.
        ss.Copy(sv, sv_adj);
        for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0;
             l--) {
          for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1;
               k >= 0; k--) {
            ApplyFusedGateDagger(
                sim, partial_fused_circuits[cur_batch_index][l][k], sv_adj);
            ApplyFusedGateDagger(
                sim, partial_fused_circuits[cur_batch_index][l][k], scratch);
          }
          if (l == 0) {
            // last layer will have no parametrized gates so can break.
            break;
          }

          // Hit a parameterized gate.
          // todo fix this copy.
          auto cur_gate =
              qsim_circuits[cur_batch_index]
                  .gates[gradient_gates[cur_batch_index][l - 1].index];
          ApplyGateDagger(sim, cur_gate, sv_adj);

          // if applicable compute control qubit mask and control value bits.
          uint64_t mask = 0;
          uint64_t cbits = 0;
          for (int k = 0; k < cur_gate.controlled_by.size(); k++) {
            uint64_t control_loc = cur_gate.controlled_by[k];
            mask |= uint64_t{1} << control_loc;
            cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
          }

          for (int k = 0;
               k < gradient_gates[cur_batch_index][l - 1].grad_gates.size();
               k++) {
            // Copy sv_adj onto scratch2 in anticipation of non-unitary
            // "gradient gate".
            ss.Copy(sv_adj, scratch2);
            if (!cur_gate.controlled_by.empty()) {
              // Gradient of controlled gates puts zeros on diagonal which is
              // the same as collapsing the state and then applying the
              // non-controlled version of the gradient gate.
              ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
            }
            qsim::ApplyGate(
                sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k],
                scratch2);

            // don't need not-found check since this is done upstream already.
            const auto it = maps[cur_batch_index].find(
                gradient_gates[cur_batch_index][l - 1].params[k]);
            const int loc = it->second.first;
            // Apply finite differencing for adjoint gradients.
            // Finite differencing enables applying multiple `gradient_gate`
            // of a symbol at the same circuit. For analytic methods like
            // parameter-shift we need to apply a single `gradient_gate`
            // per a symbol.
            std::complex<double> result = ss.InnerProduct(scratch2, scratch);
            (*output_tensor)(cur_batch_index, loc) +=
                (downstream_grads[cur_batch_index][cur_internal_index] *
                 std::complex<float>(static_cast<float>(result.real()),
                                     static_cast<float>(result.imag())));
          }
          ApplyGateDagger(sim, cur_gate, scratch);
        }
        old_batch_index = cur_batch_index;
      }
    };

    const int64_t num_cycles =
        200 * (int64_t(1) << static_cast<int64_t>(max_num_qubits));
    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
        fused_circuits.size() * output_dim_internal_size, num_cycles, DoWork);
  }