int bytes_chunk_cache::truncate()

in turbonfs/src/file_cache.cpp [2048:2405]


int bytes_chunk_cache::truncate(uint64_t trunc_len,
                                bool post,
                                uint64_t& bytes_truncated)
{
    /*
     * Must be called only when inode is being truncated.
     * Then we are guaranteed that no new writes will be sent by fuse, which
     * means no simultaneous calls to set_uptodate() can be updating cache_size.
     */
    assert(!inode || inode->is_truncate_in_progress());

    // truncate(post=true) must be called with flush_lock held.
    assert(!post || inode->is_flushing);
    assert(trunc_len <= AZNFSC_MAX_FILE_SIZE);

    AZLogDebug("[{}] <Truncate {}> {}called [S: {}, C: {}, CS: {}], "
               "U: {}, A: {}, C: {}, T: {}, chunkmap.size(): {}",
               CACHE_TAG, trunc_len, post ? "POST " : "",
               inode->get_server_file_size(),
               inode->get_client_file_size(),
               inode->get_cached_filesize(),
               bytes_uptodate.load(),
               bytes_allocated.load(),
               bytes_cached.load(),
               bytes_truncate.load(),
               chunkmap.size());

    /*
     * Number of membufs that should have been deleted/trimmed but were
     * skipped because they were in use.
     */
    int mb_skipped = 0;

    bytes_truncated = 0;

    /*
     * Step #1: Grab chunkmap lock and mark all the affected bcs inuse, so that
     *          they aren't removed while we work on them in subsequent steps.
     */
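    // Iterators to all affected bcs; each will hold an extra inuse count
    // (set in Step #1) so the bc can't be removed from the chunkmap under us.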
    std::vector<std::map<uint64_t, struct bytes_chunk>::iterator> it_vec1;
    {
        // TODO: Make this a shared lock.
        const std::unique_lock<std::mutex> _lock(chunkmap_lock_43);

        if (chunkmap.empty()) {
            return 0;
        }

        /*
         * Get the first chunk with starting offset >= trunc_len.
         * If the prev bc has one or more bytes to be truncated, include it too.
         */
        auto it = chunkmap.lower_bound(trunc_len);

        if (it != chunkmap.begin()) {
            auto prev_it = std::prev(it);
            const struct bytes_chunk *prev_bc = &(prev_it->second);

            assert(trunc_len > prev_bc->offset);

            if (trunc_len < (prev_bc->offset + prev_bc->length)) {
                // Prev bc has one or more bytes truncated.
                it = prev_it;
            }
        }

        /*
         * Save iterators to all the affected bcs in it_vec1.
         * Note that since we don't remove an inuse bc from the chunkmap, it's
         * safe to access these iterators after the chunkmap lock is released.
         */
        while (it != chunkmap.cend()) {
            const struct bytes_chunk& bc = it->second;
            struct membuf *mb = bc.get_membuf();

            // bc must have at least one byte truncated.
            assert((bc.offset + bc.length) > trunc_len);

            mb->set_inuse();
            it_vec1.emplace_back(it);

            ++it;
        }

        if (it_vec1.empty()) {
            return 0;
        }
    }

    /*
     * Step #2: Grab membuf lock for all the affected bcs.
     *          Once we have all the locks, we are guaranteed that no IOs are
     *          ongoing on any of the truncated bcs and no new IOs can be
     *          started.
     */
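    // Subset of it_vec1 for which we also hold the membuf lock; only these
    // bcs will actually be truncated (fully or partially) in Step #3.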
    std::vector<std::map<uint64_t, struct bytes_chunk>::iterator> it_vec3;
    for (auto &it : it_vec1) {
        const struct bytes_chunk& bc = it->second;
        struct membuf *mb = bc.get_membuf();

        assert(mb != nullptr);

        /*
         * We grabbed one inuse count above, and some reader(s) may also be
         * holding an inuse count.
         * We skip membufs with inuse > 1 as they are most likely being read
         * and hence set_locked() would take unnecessarily long. We would
         * rather simply skip them in this iteration and let the reader(s)
         * continue. The caller will call us again and eventually we will be
         * able to make progress.
         */
        assert(mb->get_inuse() >= 1);

        if (mb->get_inuse() > 1) {
            mb_skipped++;
            AZLogInfo("[{}] <Truncate {}> {}skipping inuse membuf "
                      "[{},{}), held by {} reader(s)",
                      CACHE_TAG, trunc_len, post ? "POST " : "",
                      bc.offset, bc.offset + bc.length,
                      mb->get_inuse() - 1);
            mb->clear_inuse();
            continue;
        }

        /*
         * Add to it_vec3 all bcs for which we could get the membuf lock.
         * These will be truncated (fully or partially).
         * If post is true we use try_lock() and skip the bc if we cannot get
         * the lock.
         */
        if (post) {
            if (mb->try_lock()) {
                it_vec3.emplace_back(it);
            } else {
                mb->clear_inuse();
                mb_skipped++;
                AZLogInfo("[{}] <Truncate {}> POST failed to lock membuf "
                          "[{},{})", CACHE_TAG,
                          trunc_len, bc.offset, bc.offset + bc.length);
                /*
                 * VFS cannot issue any writes while the truncate is still not
                 * complete, so the lock cannot be held by writers.
                 * Reads can be issued, but since truncate() would have reduced
                 * the cache_size, we won't issue reads beyond the truncated
                 * size, so reads cannot hold the membuf lock either.
                 * The one small possibility is that the truncate boundary
                 * falls inside a membuf that is currently being read. That's
                 * a very narrow race, so we keep this assert as it's useful.
                 */
                assert(0);
            }
        } else {
            mb->set_locked();
            it_vec3.emplace_back(it);
        }
    }

    assert(it_vec3.size() <= it_vec1.size());

    if (it_vec3.empty()) {
        return mb_skipped;
    }

    /*
     * Step #3: Release all the affected bcs. If there's a partial bc, it'll be
     *          trimmed from the right.
     */
    {
        const std::unique_lock<std::mutex> _lock(chunkmap_lock_43);

        for (auto& it : it_vec3) {
            struct bytes_chunk& bc = it->second;
            struct membuf *mb = bc.get_membuf();

            // membuf and chunkmap bc offset and length must always be in sync.
            assert(bc.length == mb->length);
            assert(bc.offset == mb->offset);

            /*
             * it_vec3 should not have any bc that lies completely before
             * trunc_len.
             */
            assert((bc.offset + bc.length) > trunc_len);

            /*
             * VFS blocks writes while there's an ongoing truncate, but it can
             * send read calls which can hold an inuse count on the membuf.
             * We skip such chunks and let the caller know that we couldn't
             * truncate all the chunks; the caller will then sleep for some
             * time to let readers proceed and try again.
             *
             * Note that we need to check the inuse count again after acquiring
             * the chunkmap lock as some reader(s) may have grabbed an inuse
             * count before we had the lock.
             */
            assert(mb->get_inuse() >= 1);
            if (mb->get_inuse() > 1) {
                mb_skipped++;
                AZLogInfo("[{}] <<Truncate {}>> {}skipping inuse membuf "
                          "[{},{}), held by {} reader(s)",
                          CACHE_TAG, trunc_len, post ? "POST " : "",
                          bc.offset, bc.offset + bc.length,
                          mb->get_inuse() - 1);
                mb->clear_locked();
                mb->clear_inuse();
                continue;
            }

            // trunc_len falls inside this bc?
            if (bc.offset < trunc_len) {
                // How many bytes to trim from the right?
                const uint64_t trim_bytes = (bc.offset + bc.length - trunc_len);
                assert(trim_bytes > 0);
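                // E.g. for bc [100,200) and trunc_len 150, trim_bytes is
                // (100 + 100 - 150) = 50 and the bc shrinks to [100,150).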

                AZLogDebug("[{}] <Truncate {}> {}trimming chunk from right "
                           "[{},{}) -> [{},{})", CACHE_TAG,
                           trunc_len, post ? "POST " : "",
                           bc.offset, bc.offset + bc.length,
                           bc.offset, trunc_len);

                // Trim chunkmap bc.
                bc.length -= trim_bytes;
                assert((int64_t) bc.length > 0);

                /*
                 * Trim membuf.
                 * trim() expects inuse to be dropped before calling.
                 * We have the membuf lock, so we are fine.
                 */
                mb->clear_inuse();
                mb->trim(trim_bytes, false /* left */);
                mb->set_inuse();

                assert(bytes_cached >= trim_bytes);
                assert(bytes_cached_g >= trim_bytes);
                bytes_cached -= trim_bytes;
                bytes_cached_g -= trim_bytes;

                bytes_truncated += trim_bytes;

                mb->clear_locked();
                mb->clear_inuse();
            } else {
                AZLogDebug("[{}] <Truncate {}> {}truncated full chunk [{},{}), "
                           "mb use_count: {}",
                           CACHE_TAG, trunc_len, post ? "POST " : "",
                           bc.offset, bc.offset + bc.length,
                           bc.get_membuf_usecount());

                /*
                 * Release the chunk.
                 * This will release the membuf (munmap() it in case of a
                 * file-backed cache and delete it for a heap-backed cache).
                 */
                assert(num_chunks > 0);
                num_chunks--;
                assert(num_chunks_g > 0);
                num_chunks_g--;

                assert(bytes_cached >= bc.length);
                assert(bytes_cached_g >= bc.length);
                bytes_cached -= bc.length;
                bytes_cached_g -= bc.length;

                bytes_truncated += bc.length;

                mb->set_truncated();

                mb->clear_locked();
                mb->clear_inuse();

                /*
                 * The membuf destructor will be called here unless the read
                 * path gets a ref to this membuf. Note that writes won't be
                 * coming as VFS will serialize them with truncate.
                 * For post=false we don't hold the flush_lock, so another way
                 * these membufs may have an extra ref count held is if
                 * flush_cache_and_wait() is trying to flush these membufs.
                 */
                chunkmap.erase(it);
            }
        }

        /*
         * Recalculate cache size.
         */
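        // The new cache_size is the end offset of the highest-offset uptodate
        // membuf left in the cache, e.g. with remaining chunks [0,4K) uptodate
        // and [4K,8K) not uptodate it becomes 4K. If no uptodate membuf
        // remains, cache_size drops to 0.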
        bool cache_size_updated = false;

        for (auto it = chunkmap.rbegin(); it != chunkmap.rend(); ++it) {
            struct bytes_chunk *bc = &(it->second);
            struct membuf *mb = bc->get_membuf();
            if (mb->is_uptodate()) {
                assert(mb->length > 0);
                assert(cache_size >= (mb->offset + mb->length));
                /*
                 * cache_size is only reduced by truncate() and truncates
                 * are serialized by the VFS inode lock, so only one truncate
                 * can be ongoing, thus we are guaranteed that cache_size
                 * cannot be reduced. Also, since no new writes will be sent
                 * by fuse, no calls to set_uptodate() could be ongoing and
                 * hence cache_size won't be increased either.
                 */
                uint64_t expected = cache_size;
                [[maybe_unused]]
                const bool updated =
                    cache_size.compare_exchange_strong(expected, mb->offset + mb->length);
                assert(updated);
                assert(cache_size == (mb->offset + mb->length));
                assert(cache_size > 0);
                cache_size_updated = true;
                break;
            }
        }

        if (!cache_size_updated) {
            uint64_t expected = cache_size;
            [[maybe_unused]]
            const bool updated =
                cache_size.compare_exchange_strong(expected, 0);
            assert(updated);
            assert(cache_size == 0);
        }
    }

    assert(bytes_truncated <= (AZNFSC_MAX_FILE_SIZE - trunc_len));

    bytes_truncate += bytes_truncated;
    bytes_truncate_g += bytes_truncated;

    num_truncate++;
    num_truncate_g++;

    AZLogDebug("[{}] <Truncate {}> {}done, [S: {}, C: {}, CS: {}], "
               "cache_size: {}, U: {}, A: {}, C: {}, T: {}, "
               "chunkmap.size(): {}, bytes_truncated: {}, mb_skipped: {}",
               CACHE_TAG,
               trunc_len, post ? "POST " : "",
               inode->get_server_file_size(),
               inode->get_client_file_size(),
               inode->get_cached_filesize(),
               cache_size.load(),
               bytes_uptodate.load(),
               bytes_allocated.load(),
               bytes_cached.load(),
               bytes_truncate.load(),
               chunkmap.size(),
               bytes_truncated,
               mb_skipped);

    /*
     * See comment in get_cache_size().
     * If this invariant were violated we would have already hit the same
     * assert in get_cached_filesize()->get_cache_size() before reaching here,
     * but keep it here for completeness.
     */
    assert((cache_size >= bytes_uptodate) ||
           (bytes_uptodate > bytes_cached));

    return mb_skipped;
}
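
The return value drives the caller's retry: a non-zero mb_skipped means some
membufs were left untouched because readers still held them, so the caller is
expected to back off briefly and call truncate() again (see the comments in
Step #2 and Step #3 above). Below is a minimal sketch of such a retry loop;
the helper name, the 10 ms backoff and the include path are assumptions for
illustration, not taken from the source.

#include <chrono>
#include <cstdint>
#include <thread>

#include "file_cache.h"   // assumed location of bytes_chunk_cache

/*
 * Hypothetical helper: keep truncating the cache until no membuf had to be
 * skipped, sleeping briefly between attempts so readers can drop their
 * inuse counts.
 */
static uint64_t truncate_cache_with_retry(bytes_chunk_cache& cache,
                                          uint64_t trunc_len,
                                          bool post)
{
    uint64_t total_truncated = 0;
    uint64_t bytes_truncated = 0;

    // truncate() returns the number of membufs it skipped because some
    // reader held an extra inuse count (or the membuf lock, for post=true).
    while (cache.truncate(trunc_len, post, bytes_truncated) > 0) {
        total_truncated += bytes_truncated;
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }

    // Account for the final, fully successful call as well.
    total_truncated += bytes_truncated;
    return total_truncated;
}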