void PCAMatrix::train()

in faiss/VectorTransform.cpp [434:598]


void PCAMatrix::train(Index::idx_t n, const float* x) {
    const float* x_in = x;

    x = fvecs_maybe_subsample(
            d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose);

    ScopeDeleter<float> del_x(x != x_in ? x : nullptr);

    // compute mean
    mean.clear();
    mean.resize(d_in, 0.0);
    if (have_bias) { // we may want to skip the bias
        const float* xi = x;
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < d_in; j++)
                mean[j] += *xi++;
        }
        for (int j = 0; j < d_in; j++)
            mean[j] /= n;
    }
    if (verbose) {
        printf("mean=[");
        for (int j = 0; j < d_in; j++)
            printf("%g ", mean[j]);
        printf("]\n");
    }

    if (n >= d_in) {
        // compute covariance matrix, store it in PCA matrix
        PCAMat.resize(d_in * d_in);
        float* cov = PCAMat.data();
        { // initialize with  mean * mean^T term
            float* ci = cov;
            for (int i = 0; i < d_in; i++) {
                for (int j = 0; j < d_in; j++)
                    *ci++ = -n * mean[i] * mean[j];
            }
        }
        {
            FINTEGER di = d_in, ni = n;
            float one = 1.0;
            ssyrk_("Up",
                   "Non transposed",
                   &di,
                   &ni,
                   &one,
                   (float*)x,
                   &di,
                   &one,
                   cov,
                   &di);
        }
        if (verbose && d_in <= 10) {
            float* ci = cov;
            printf("cov=\n");
            for (int i = 0; i < d_in; i++) {
                for (int j = 0; j < d_in; j++)
                    printf("%10g ", *ci++);
                printf("\n");
            }
        }

        std::vector<double> covd(d_in * d_in);
        for (size_t i = 0; i < d_in * d_in; i++)
            covd[i] = cov[i];

        std::vector<double> eigenvaluesd(d_in);

        eig(d_in, covd.data(), eigenvaluesd.data(), verbose);

        for (size_t i = 0; i < d_in * d_in; i++)
            PCAMat[i] = covd[i];
        eigenvalues.resize(d_in);

        for (size_t i = 0; i < d_in; i++)
            eigenvalues[i] = eigenvaluesd[i];

    } else {
        std::vector<float> xc(n * d_in);

        for (size_t i = 0; i < n; i++)
            for (size_t j = 0; j < d_in; j++)
                xc[i * d_in + j] = x[i * d_in + j] - mean[j];

        // compute Gram matrix
        std::vector<float> gram(n * n);
        {
            FINTEGER di = d_in, ni = n;
            float one = 1.0, zero = 0.0;
            ssyrk_("Up",
                   "Transposed",
                   &ni,
                   &di,
                   &one,
                   xc.data(),
                   &di,
                   &zero,
                   gram.data(),
                   &ni);
        }

        if (verbose && d_in <= 10) {
            float* ci = gram.data();
            printf("gram=\n");
            for (int i = 0; i < n; i++) {
                for (int j = 0; j < n; j++)
                    printf("%10g ", *ci++);
                printf("\n");
            }
        }

        std::vector<double> gramd(n * n);
        for (size_t i = 0; i < n * n; i++)
            gramd[i] = gram[i];

        std::vector<double> eigenvaluesd(n);

        // eig will fill in only the n first eigenvals

        eig(n, gramd.data(), eigenvaluesd.data(), verbose);

        PCAMat.resize(d_in * n);

        for (size_t i = 0; i < n * n; i++)
            gram[i] = gramd[i];

        eigenvalues.resize(d_in);
        // fill in only the n first ones
        for (size_t i = 0; i < n; i++)
            eigenvalues[i] = eigenvaluesd[i];

        { // compute PCAMat = x' * v
            FINTEGER di = d_in, ni = n;
            float one = 1.0;

            sgemm_("Non",
                   "Non Trans",
                   &di,
                   &ni,
                   &ni,
                   &one,
                   xc.data(),
                   &di,
                   gram.data(),
                   &ni,
                   &one,
                   PCAMat.data(),
                   &di);
        }

        if (verbose && d_in <= 10) {
            float* ci = PCAMat.data();
            printf("PCAMat=\n");
            for (int i = 0; i < n; i++) {
                for (int j = 0; j < d_in; j++)
                    printf("%10g ", *ci++);
                printf("\n");
            }
        }
        fvec_renorm_L2(d_in, n, PCAMat.data());
    }

    prepare_Ab();
    is_trained = true;
}