void nfs_client::periodic_updater()

in turbonfs/src/nfs_client.cpp [217:410]


void nfs_client::periodic_updater()
{
    // Maximum cache size allowed in bytes.
    static const uint64_t max_cache =
        (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
    assert(max_cache != 0);

    /*
     * #1 Calculate recent read/write throughput to the server.
     */
    static std::atomic<time_t> last_sec;
    static std::atomic<uint64_t> last_server_bytes_written;
    static std::atomic<uint64_t> last_server_bytes_read;
    static std::atomic<uint64_t> last_genid;
    const time_t now_sec = ::time(NULL);
    const int sample_intvl = 5;

    assert(GET_GBL_STATS(server_bytes_written) >= last_server_bytes_written);
    assert(GET_GBL_STATS(server_bytes_read) >= last_server_bytes_read);
    assert(now_sec >= last_sec);

    /*
     * Every sample_intvl seconds, compute the read/write throughput for the
     * last interval. Only one thread should update the throughput.
     */
    const int intvl = now_sec - last_sec;
    if (intvl >= sample_intvl) {
        uint64_t expected = last_genid.load();
        if (rw_genid.compare_exchange_strong(expected, expected + 1)) {
            /*
             * Cast intvl to uint64_t so the divisor is computed in 64 bits;
             * the first call can see a very large intvl (last_sec == 0) and
             * the int multiplication would otherwise overflow.
             */
            w_MBps = (GET_GBL_STATS(server_bytes_written) - last_server_bytes_written) /
                     ((uint64_t) intvl * 1'000'000);
            r_MBps = (GET_GBL_STATS(server_bytes_read) - last_server_bytes_read) /
                     ((uint64_t) intvl * 1'000'000);

            last_sec = now_sec;
            last_server_bytes_read = GET_GBL_STATS(server_bytes_read);
            last_server_bytes_written = GET_GBL_STATS(server_bytes_written);
            last_genid = rw_genid.load();
        }
    }
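
    /*
     * Worked example (hypothetical numbers): if server_bytes_written grew by
     * 2'500'000'000 bytes over a 5-sec interval, then
     * w_MBps = 2'500'000'000 / (5 * 1'000'000) = 500, i.e., 500 MB/s in
     * decimal MB.
     */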

    /*
     * assert() is compiled out in release builds, so also guard at runtime;
     * the percentage math below divides by max_cache.
     */
    if (max_cache == 0) {
        return;
    }

    /*
     * #2 Update scaling factors for readahead and flush/commit.
     *    We arrive at optimal values for these scaling factors by letting
     *    each consumer grow to take up more cache space while its share of
     *    the cache is small, and scaling it down to take up less space as
     *    its share grows. These competing forces drive both to an
     *    equilibrium.
     */

    /*
     * cache  => total cache allocated, read + write.
     * wcache => cache allocated by writers.
     * rcache => cache allocated by readers.
     *
     * Note: We use dirty and commit_pending to indicate cache used by writers.
     *       This assumes that cache is released immediately after a write or
     *       commit completes, else we would wrongly account write cache as
     *       read cache.
     */
    const uint64_t cache = bytes_chunk_cache::bytes_allocated_g;
    const uint64_t wcache = bytes_chunk_cache::bytes_dirty_g +
                            bytes_chunk_cache::bytes_commit_pending_g;
    const uint64_t rcache = (uint64_t) std::max<int64_t>(cache - wcache, 0);

    /*
     * Read cache usage as a percentage of max_cache.
     * This tells us how aggressively readers are filling up the cache through
     * readahead. We increase readahead when this is low and reduce readahead
     * as it grows.
     */
    const double pct_rcache = (rcache * 100.0) / max_cache;

    /*
     * Write cache usage as a percentage of max_cache.
     * This tells us how aggressively writers are filling up the cache with
     * dirty data. We reduce max_dirty_extent_bytes() as this grows, so that
     * dirty data is flushed sooner, and increase max_dirty_extent_bytes()
     * when this is low and we want to accumulate more dirty data and flush
     * it in bigger chunks, for better backend write throughput.
     */
    const double pct_wcache = (wcache * 100.0) / max_cache;

    /*
     * Total cache usage (read + write) as a percentage of max_cache.
     * Reads and writes are controlled individually per pct_rcache and
     * pct_wcache, but ultimately total cache usage is consulted and the
     * read and write scale factors are reduced accordingly.
     */
    const double pct_cache = (cache * 100.0) / max_cache;

    assert(pct_cache <= 100.0);
    assert((pct_rcache + pct_wcache) <= pct_cache);
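
    /*
     * Illustration (hypothetical numbers): with max_cache = 4096MB, a wcache
     * of 1024MB (dirty + commit-pending) gives pct_wcache = 25.0, an rcache
     * of 2048MB gives pct_rcache = 50.0, and pct_cache is 75.0.
     */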

    double rscale = 1.0;
    double wscale = 1.0;

    /*
     * Stop readahead completely if we are beyond the max cache size,
     * otherwise scale it down proportionately to keep the cache size under
     * the max cache limit. We also scale up the readahead to make better
     * utilization of the allowed cache size: when there are fewer readers,
     * each is allowed to use more of the cache for readahead.
     *
     * Just like reads, writes also try to mop up cache space. These two
     * opposing forces eventually reach an equilibrium.
     */
    if (pct_rcache >= 100) {
        /*
         * Reads are taking up all the cache space; completely stop
         * readaheads. This will cause read cache utilization to drop, after
         * which we will increase readaheads again, finally settling on an
         * optimal value.
         */
        rscale = 0;
    } else if (pct_rcache > 95) {
        rscale = 0.5;
    } else if (pct_rcache > 90) {
        rscale = 0.7;
    } else if (pct_rcache > 80) {
        rscale = 0.8;
    } else if (pct_rcache > 70) {
        rscale = 0.9;
    } else if (pct_rcache < 3) {
        rscale = 32;
    } else if (pct_rcache < 5) {
        rscale = 24;
    } else if (pct_rcache < 10) {
        rscale = 12;
    } else if (pct_rcache < 20) {
        rscale = 6;
    } else if (pct_rcache < 30) {
        rscale = 4;
    } else if (pct_rcache < 50) {
        rscale = 2.5;
    }
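
    /*
     * The ladder is evaluated top-down: e.g., a pct_rcache of 4 skips all
     * the high-utilization branches and the "< 3" branch, and matches "< 5",
     * giving rscale = 24. A pct_rcache between 50 and 70 (inclusive) falls
     * through and keeps the default rscale of 1.0.
     */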

    if (pct_wcache > 95) {
        /*
         * Every file has a fundamental right to 100MB of cache space.
         * If we reduce it further we will end up with sub-optimal writes
         * to the server.
         */
        wscale = 1.0/10;
    } else if (pct_wcache > 90) {
        // 200MB
        wscale = 2.0/10;
    } else if (pct_wcache > 80) {
        // 300MB
        wscale = 3.0/10;
    } else if (pct_wcache > 70) {
        // 400MB
        wscale = 4.0/10;
    } else if (pct_wcache > 60) {
        // 600MB
        wscale = 6.0/10;
    } else if (pct_wcache > 50) {
        // 700MB
        wscale = 7.0/10;
    }
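
    /*
     * The MB figures in the comments above correspond to the scaled
     * max_dirty_extent_bytes(), implying an unscaled value of 1000MB that
     * wscale multiplies. A pct_wcache of 50 or below keeps the default
     * wscale of 1.0.
     */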

    static uint64_t last_log_sec;
    bool log_now = false;

    // Don't log more often than once every 5 secs.
    if ((now_sec - last_log_sec) >= 5) {
        log_now = true;
        last_log_sec = now_sec;
    }

    if (fc_scale_factor != wscale) {
        if (log_now) {
            AZLogInfo("[FC] Scale factor updated ({} -> {}), "
                      "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]",
                      fc_scale_factor.load(), wscale,
                      pct_rcache, pct_wcache, pct_cache);
        }
        fc_scale_factor = wscale;
        assert(fc_scale_factor >= 1.0/10);
    }

    if (ra_scale_factor != rscale) {
        if (log_now) {
            AZLogInfo("[RA] Scale factor updated ({} -> {}), "
                      "cache util [R: {:0.2f}%, W: {:0.2f}%, T: {:0.2f}%]",
                      ra_scale_factor.load(), rscale,
                      pct_rcache, pct_wcache, pct_cache);
        }
        ra_scale_factor = rscale;
        assert(ra_scale_factor >= 0);
    }
}
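
Below is a minimal, self-contained sketch of the generation-id CAS pattern used in step #1 to ensure only one thread publishes the throughput sample for a given interval. The names here (rw_genid, w_MBps, sample_server_bytes_written) are illustrative stand-ins for the real nfs_client members and the GET_GBL_STATS() macro, not the actual API.

#include <atomic>
#include <cstdint>
#include <ctime>

static std::atomic<uint64_t> rw_genid{0};
static std::atomic<uint64_t> w_MBps{0};

// Hypothetical stats getter; the real client reads a global counter.
static uint64_t sample_server_bytes_written()
{
    static std::atomic<uint64_t> fake_counter{0};
    return fake_counter.fetch_add(50'000'000) + 50'000'000;
}

void update_write_throughput()
{
    static std::atomic<time_t> last_sec{0};
    static std::atomic<uint64_t> last_bytes{0};
    static std::atomic<uint64_t> last_genid{0};

    const time_t now_sec = ::time(nullptr);
    const int intvl = (int)(now_sec - last_sec);
    if (intvl < 5) {
        return;
    }

    /*
     * Whichever thread moves rw_genid from last_genid to last_genid+1 wins
     * the CAS; every other thread sees the bumped genid, fails the CAS and
     * skips the update, so the sample is computed exactly once per interval.
     */
    uint64_t expected = last_genid.load();
    if (rw_genid.compare_exchange_strong(expected, expected + 1)) {
        const uint64_t bytes = sample_server_bytes_written();
        w_MBps = (bytes - last_bytes) / ((uint64_t) intvl * 1'000'000);

        last_sec = now_sec;
        last_bytes = bytes;
        last_genid = rw_genid.load();
    }
}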
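
The "competing forces" behavior of step #2 can be illustrated with a toy feedback loop: readahead fills the cache in proportion to rscale, a fixed drain empties it, and the ladder keeps nudging rscale until utilization settles into a narrow band. The fill/drain rates below are made up for the sketch; only the rscale ladder is taken from the code above.

#include <cstdio>

// The rscale ladder from periodic_updater(), as a pure function.
static double rscale_for(double pct_rcache)
{
    if (pct_rcache >= 100) return 0;
    if (pct_rcache > 95)   return 0.5;
    if (pct_rcache > 90)   return 0.7;
    if (pct_rcache > 80)   return 0.8;
    if (pct_rcache > 70)   return 0.9;
    if (pct_rcache < 3)    return 32;
    if (pct_rcache < 5)    return 24;
    if (pct_rcache < 10)   return 12;
    if (pct_rcache < 20)   return 6;
    if (pct_rcache < 30)   return 4;
    if (pct_rcache < 50)   return 2.5;
    return 1.0;
}

int main()
{
    double pct = 1.0;  // start with a nearly empty read cache
    for (int tick = 0; tick < 20; tick++) {
        const double rscale = rscale_for(pct);
        // Made-up dynamics: readahead adds 2% of max_cache per tick per
        // unit of rscale, application reads drain a fixed 10% per tick.
        pct += 2.0 * rscale - 10.0;
        if (pct < 0) pct = 0;
        if (pct > 100) pct = 100;
        printf("tick %2d: rscale = %4.1f, pct_rcache = %5.1f\n",
               tick, rscale, pct);
    }
    return 0;
}

With these rates the simulation settles into a band around 20% utilization, alternating between rscale 4 and 6; the point is the negative feedback, not the exact numbers.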