void ComputeSmall()

in tensorflow_quantum/core/ops/noise/tfq_noisy_samples.cc [205:318]


  void ComputeSmall(const std::vector<int>& num_qubits,
                    const int max_num_qubits, const int num_samples,
                    const std::vector<NoisyQsimCircuit>& ncircuits,
                    tensorflow::OpKernelContext* context,
                    tensorflow::TTypes<int8_t, 3>::Tensor* output_tensor) {
    using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
    using StateSpace = Simulator::StateSpace;
    using QTSimulator =
        qsim::QuantumTrajectorySimulator<qsim::IO, QsimGate,
                                         qsim::MultiQubitGateFuser, Simulator>;

    const int output_dim_batch_size = output_tensor->dimension(0);
    const int num_threads = context->device()
                                ->tensorflow_cpu_worker_threads()
                                ->workers->NumThreads();

    // [num_threads, batch_size].
    std::vector<std::vector<int>> rep_offsets(
        num_threads, std::vector<int>(output_dim_batch_size, 0));
    BalanceTrajectory(num_samples, num_threads, &rep_offsets);

    // [num_threads, batch_size] stores the number of
    // samples written by thread range [0, i].
    std::vector<std::vector<long>> offset_prefix_sum(
        num_threads, std::vector<long>(output_dim_batch_size, 0));

    for (int i = 0; i < output_dim_batch_size; i++) {
      int p_reps = (num_samples + num_threads - 1) / num_threads;
      offset_prefix_sum[0][i] = rep_offsets[0][i] + p_reps;
      for (int j = 1; j < num_threads; j++) {
        offset_prefix_sum[j][i] += offset_prefix_sum[j - 1][i];
        offset_prefix_sum[j][i] += rep_offsets[j][i] + p_reps;
      }
    }

    tensorflow::GuardedPhiloxRandom random_gen;
    random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());

    auto DoWork = [&](int start, int end) {
      // Begin simulation.
      const auto tfq_for = qsim::SequentialFor(1);
      int largest_nq = 1;
      Simulator sim = Simulator(tfq_for);
      StateSpace ss = StateSpace(tfq_for);
      auto sv = ss.Create(largest_nq);
      auto scratch = ss.Create(largest_nq);

      int needed_random =
          4 * (num_samples * ncircuits.size() + num_threads) / num_threads;
      needed_random += 4;
      auto local_gen = random_gen.ReserveSamples32(needed_random);
      tensorflow::random::SimplePhilox rand_source(&local_gen);

      for (int i = 0; i < ncircuits.size(); i++) {
        int nq = num_qubits[i];
        int j = start > 0 ? offset_prefix_sum[start - 1][i] : 0;
        int needed_samples = offset_prefix_sum[start][i] - j;
        if (needed_samples <= 0) {
          continue;
        }

        if (nq > largest_nq) {
          largest_nq = nq;
          sv = ss.Create(largest_nq);
          scratch = ss.Create(largest_nq);
        }
        QTSimulator::Parameter param;
        param.collect_kop_stat = false;
        param.collect_mea_stat = true;
        param.normalize_before_mea_gates = true;

        // Track op-wise stats.
        std::vector<uint64_t> gathered_samples;
        int run_samples = 0;

        while (1) {
          ss.SetStateZero(sv);
          QTSimulator::RunOnce(param, ncircuits[i], rand_source.Rand64(), ss,
                               sim, scratch, sv, gathered_samples);

          uint64_t q_ind = 0;
          uint64_t mask = 1;
          bool val = 0;
          while (q_ind < nq) {
            val = gathered_samples[0] & mask;
            (*output_tensor)(
                i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = val;
            q_ind++;
            mask <<= 1;
          }
          while (q_ind < max_num_qubits) {
            (*output_tensor)(
                i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = -2;
            q_ind++;
          }

          j++;
          run_samples++;

          // Check if we have gathered enough samples.
          if (run_samples >= needed_samples) {
            break;
          }
        }
      }
    };

    // block_size = 1.
    tensorflow::thread::ThreadPool::SchedulingParams scheduling_params(
        tensorflow::thread::ThreadPool::SchedulingStrategy::kFixedBlockSize,
        absl::nullopt, 1);
    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
        num_threads, scheduling_params, DoWork);
  }