void fcsm::run()

in turbonfs/src/fcsm.cpp [119:311]


void fcsm::run(struct rpc_task *task,
               uint64_t extent_left,
               uint64_t extent_right)
{
    assert(task->magic == RPC_TASK_MAGIC);
    assert(task->get_op_type() == FUSE_WRITE);
    assert(task->rpc_api->write_task.is_fe());
    assert(task->rpc_api->write_task.get_size() > 0);
    assert(extent_right > extent_left);

    const size_t length = task->rpc_api->write_task.get_size();
    const off_t offset = task->rpc_api->write_task.get_offset();
    // Note: sparse write detection is hardcoded off here, so the
    // sparse-write handling described below is not triggered from this path.
    const bool sparse_write = false;
    const fuse_ino_t ino = task->rpc_api->write_task.get_ino();

    /*
     * fcsm::run() is called after the fuse thread successfully copies user
     * data into the cache. We can have the following cases (in decreasing
     * order of criticality):
     * 1. Cache has dirty+uncommitted data beyond the "inline write threshold",
     *    ref do_inline_write().
     *    In this case we begin flush of the dirty data and/or commit of the
     *    uncommitted data.
     *    This is a memory pressure situation and hence we do not complete the
     *    application write till all the above backend writes complete.
     *    This happens when the application writes faster than our backend
     *    write throughput: eventually the dirty data grows beyond the "inline
     *    write threshold" and we have to slow down the writers by delaying
     *    completion.
     * 2. Cache has uncommitted data beyond the "commit threshold", ref
     *    commit_required().
     *    In this case we free up space in the cache by committing data.
     *    We just initiate a commit and complete the current write request.
     *    Note that we want to delay commits so that we can reduce the number
     *    of commit calls (committing more data per call), since commits
     *    effectively serialize writes: we cannot send any other write to the
     *    server while a commit is in progress.
     * 3. Cache has enough dirty data that we can flush.
     *    For sequential writes this means the new write caused a contiguous
     *    extent to grow beyond max_dirty_extent_bytes(), aka MDEB, while for
     *    non-seq writes it means that the total dirty data grew beyond MDEB.
     *    In this case we begin flush of this contiguous extent (in multiple
     *    parallel wsize sized blocks), since there's no benefit in waiting
     *    longer: the data is already sufficient for the server scheduler to
     *    write effectively, in optimally sized blocks.
     *    We complete the application write right away without waiting for the
     *    flush to complete, as we are not under memory pressure.
     *    (A sketch of these three threshold predicates follows this listing.)
     *
     * Other than this we have the special case of "write beyond eof" (termed
     * sparse write). In the sparse write case too we perform "inline write"
     * of all the dirty data. This is needed for correct read behaviour.
     * Imagine a reader reading from the sparse part of the file which is not
     * yet in the bytes_chunk_cache. This read will be issued to the server,
     * and since the server doesn't know the updated file size (as the write
     * may still be sitting in our bytes_chunk_cache) it will return eof. This
     * is incorrect: such reads, issued after a successful write, are valid
     * and should read as zeroes for the sparse range.
     */

    /*
     * If the extent size exceeds the max allowed dirty size as returned by
     * max_dirty_extent_bytes(), then it's time to flush the extent.
     * Note that this causes sequential writes to be flushed at just the
     * right intervals: we issue fewer write calls while still letting the
     * server scheduler merge effectively.
     * See bytes_to_flush for how random writes are flushed.
     *
     * Note: max_dirty_extent is static as it doesn't change after it's
     *       queried for the first time.
     */
    static const uint64_t max_dirty_extent =
        inode->get_filecache()->max_dirty_extent_bytes();
    assert(max_dirty_extent > 0);

    /*
     * Check what kind of limit we have hit.
     */
    const bool need_inline_write =
        (sparse_write || inode->get_filecache()->do_inline_write());
    const bool need_commit =
        !need_inline_write &&
        inode->get_filecache()->commit_required();
    const bool need_flush =
        !need_inline_write &&
        inode->get_filecache()->flush_required(extent_right - extent_left);

    AZLogDebug("[{}] fcsm::run() (sparse={}, need_inline_write={}, "
               "need_commit={}, need_flush={}, extent=[{}, {}))",
               inode->get_fuse_ino(), sparse_write, need_inline_write,
               need_commit, need_flush, extent_left, extent_right);

    /*
     * Nothing to do, we can complete the application write right away.
     * This should be the happy path!
     */
    if (!need_inline_write && !need_commit && !need_flush) {
        INC_GBL_STATS(writes_np, 1);
        task->reply_write(length);
        return;
    }

    /*
     * Case 1: Inline write.
     * Do we need to perform "inline write"?
     * Inline write implies that we flush all the dirty data and wait for all
     * the corresponding backend writes to complete.
     */
    if (need_inline_write) {
        INC_GBL_STATS(inline_writes, 1);

        AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})",
                   ino, sparse_write, (extent_right - extent_left),
                   extent_left, extent_right);

        /*
         * Queue a blocking flush/commit target, which will complete the fuse
         * write only after the flush/commit completes (see the target-queue
         * sketch after this listing).
         * In case of stable writes we queue a flush target, while in case of
         * unstable writes we queue a commit target. A commit target
         * implicitly performs a flush before the commit.
         */
        inode->flush_lock();
        if (inode->is_stable_write()) {
            inode->get_fcsm()->ensure_flush(offset, length, task);
        } else {
            inode->get_fcsm()->ensure_commit(offset, length, task);
        }
        inode->flush_unlock();

        // Free up the fuse thread without completing the application write.
        return;
    }

    /*
     * Case 2: Commit.
     * We don't need to do inline writes. See if we need to commit the
     * uncommitted data to the backend. We just need to start the commit;
     * we don't hold the current write task till the commit completes.
     */
    if (need_commit) {
        assert(!inode->is_stable_write());

        inode->flush_lock();
        /*
         * Commit will only start after all ongoing flushes complete, and no
         * new flush can start (as we hold the flush_lock). This means no new
         * commit-pending data can be added, and hence the current in-progress
         * commit will finish committing all commit-pending bytes (see the
         * gating sketch after this listing).
         */
        if (inode->is_commit_in_progress()) {
            assert(!inode->get_filecache()->is_flushing_in_progress());

            AZLogDebug("[{}] Commit already in progress, skipping commit",
                       ino);
        } else {
            AZLogDebug("[{}] Committing {} bytes", ino,
                       inode->get_filecache()->max_commit_bytes());
            inode->get_fcsm()->ensure_commit(offset, length, nullptr);
        }
        inode->flush_unlock();
    }

    /*
     * Case 3: Flush.
     * We don't need to do inline writes. See if we have enough dirty data
     * and need to start an async flush.
     */
    if ((extent_right - extent_left) < max_dirty_extent) {
        /*
         * This is the case of non-sequential writes causing enough dirty
         * data to accumulate; we need to flush all of it. The sentinel
         * range [0, UINT64_MAX) means "flush all dirty data" (see the
         * sketch after this listing).
         */
        extent_left = 0;
        extent_right = UINT64_MAX;
    }

    /*
     * Queue a non-blocking flush target for flushing *all* the dirty data.
     */
    if (need_flush) {
        inode->flush_lock();
        inode->get_fcsm()->ensure_flush(offset, length, nullptr);
        inode->flush_unlock();
    }

    /*
     * Complete the write request without waiting for the backend flush/commit
     * to complete. The inline-write case returned above, since it must
     * complete the task only after the blocking flush/commit finishes.
     */
    assert(!need_inline_write);
    task->reply_write(length);
}
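
For orientation, here is a minimal sketch of how the three cache-pressure predicates referenced above could be layered. The predicate names (do_inline_write, commit_required, flush_required) and max_dirty_extent_bytes come from the listing; the counters, thresholds and their relative sizes are illustrative assumptions, not the real bytes_chunk_cache.

#include <atomic>
#include <cstdint>

// Illustrative stand-in for the threshold logic in bytes_chunk_cache.
struct cache_pressure_sketch {
    std::atomic<uint64_t> dirty{0};        // copied from user, not yet flushed.
    std::atomic<uint64_t> uncommitted{0};  // flushed unstable, not committed.

    // Hypothetical thresholds, in decreasing order of criticality.
    static constexpr uint64_t inline_write_threshold = 1024ULL << 20;
    static constexpr uint64_t commit_threshold = 512ULL << 20;
    static constexpr uint64_t mdeb = 64ULL << 20; // max_dirty_extent_bytes()

    // Case 1: memory pressure, writers must be throttled until backend
    // writes drain.
    bool do_inline_write() const {
        return (dirty + uncommitted) >= inline_write_threshold;
    }

    // Case 2: enough uncommitted data to make a commit call worthwhile.
    bool commit_required() const {
        return uncommitted >= commit_threshold;
    }

    // Case 3: either the contiguous extent just written (sequential case)
    // or the total dirty data (non-sequential case) crossed MDEB.
    bool flush_required(uint64_t extent_bytes) const {
        return extent_bytes >= mdeb || dirty >= mdeb;
    }
};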
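
The inline-write path relies on the queued target carrying the rpc_task: ensure_flush()/ensure_commit() with a non-null task reply to the fuse write only once the flush/commit catches up, while a nullptr task queues a purely asynchronous target (Cases 2 and 3). The sketch below shows one way such a goalpost queue could work; the types and the flushed-bytes sequence number are assumptions, only the ensure_flush(offset, length, task) shape comes from the listing.

#include <cstdint>
#include <functional>
#include <queue>

// Hypothetical flush target: a goalpost plus an optional completion.
struct flush_target {
    uint64_t goal;                      // flushed-bytes mark to reach.
    std::function<void()> on_complete;  // empty => non-blocking target.
};

struct fcsm_sketch {
    std::queue<flush_target> ftgtq;
    uint64_t flushed_seq = 0;           // total bytes flushed so far.

    // A non-null task in the real code corresponds to a callback here.
    void ensure_flush(uint64_t offset, uint64_t length,
                      std::function<void()> complete = {}) {
        ftgtq.push({offset + length, std::move(complete)});
        // ... kick off backend writes for dirty data up to the goal ...
    }

    // Invoked as backend WRITE RPCs complete.
    void on_flush_progress(uint64_t bytes_now_flushed) {
        flushed_seq = bytes_now_flushed;
        while (!ftgtq.empty() && ftgtq.front().goal <= flushed_seq) {
            if (ftgtq.front().on_complete) {
                ftgtq.front().on_complete();  // e.g. task->reply_write(len).
            }
            ftgtq.pop();
        }
    }
};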
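
Case 2's correctness argument (an in-progress commit ends up covering all commit-pending bytes) hinges on flush_lock() being taken both here and by anyone starting a flush. A condensed sketch of that gating, using a plain std::mutex as a stand-in for flush_lock()/flush_unlock(); the real code additionally waits for in-progress flushes to drain before issuing the commit.

#include <cstdint>
#include <mutex>

struct inode_commit_sketch {
    std::mutex flush_mtx;            // stand-in for flush_lock()/unlock().
    bool commit_in_progress = false;
    uint64_t commit_pending = 0;     // flushed-but-uncommitted bytes.

    void maybe_commit() {
        std::lock_guard<std::mutex> lk(flush_mtx);
        if (commit_in_progress) {
            // The in-flight commit will pick up all commit-pending bytes:
            // no new flush can start while we hold flush_mtx, so
            // commit_pending cannot grow behind the commit's back.
            return;
        }
        commit_in_progress = true;
        // ... issue COMMIT for commit_pending bytes; clear the flag and
        // commit_pending when it completes ...
    }

    void start_flush() {
        std::lock_guard<std::mutex> lk(flush_mtx);
        // Flushes contend on the same lock, which is what serializes
        // writes against commits.
        // ... issue WRITE RPCs for dirty data ...
    }
};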
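
Finally, the [0, UINT64_MAX) range set in Case 3 is just a sentinel for "flush every dirty chunk" when no single contiguous extent crossed MDEB. A toy interpretation, assuming a hypothetical offset-to-length dirty map (not the real cache structure):

#include <cstdint>
#include <cstdio>
#include <map>

// Hypothetical dirty map: chunk offset -> chunk length.
using dirty_map = std::map<uint64_t, uint64_t>;

// Flush every dirty chunk overlapping [left, right); the sentinel
// [0, UINT64_MAX) therefore degenerates to "flush everything".
static void flush_range(const dirty_map& dirty, uint64_t left, uint64_t right)
{
    for (const auto& [off, len] : dirty) {
        if (off < right && (off + len) > left) {
            std::printf("flush chunk [%ju, %ju)\n",
                        (uintmax_t) off, (uintmax_t) (off + len));
        }
    }
}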