in turbonfs/src/fcsm.cpp [119:311]
void fcsm::run(struct rpc_task *task,
uint64_t extent_left,
uint64_t extent_right)
{
assert(task->magic == RPC_TASK_MAGIC);
assert(task->get_op_type() == FUSE_WRITE);
assert(task->rpc_api->write_task.is_fe());
assert(task->rpc_api->write_task.get_size() > 0);
assert(extent_right > extent_left);
const size_t length = task->rpc_api->write_task.get_size();
const off_t offset = task->rpc_api->write_task.get_offset();
const bool sparse_write = false;
const fuse_ino_t ino = task->rpc_api->write_task.get_ino();
    /*
     * fcsm::run() is called after the fuse thread successfully copies the
     * user data into the cache. We can have the following cases (in
     * decreasing order of criticality):
     * 1. Cache has dirty+uncommitted data beyond the "inline write
     *    threshold", ref do_inline_write().
     *    In this case we begin flush of the dirty data and/or commit of the
     *    uncommitted data.
     *    This is a memory pressure situation, hence we do not complete the
     *    application write till all the above backend writes complete.
     *    This happens when the application is writing faster than our
     *    backend write throughput: dirty data eventually grows beyond the
     *    "inline write threshold" and we have to slow down the writers by
     *    delaying completion.
     * 2. Cache has uncommitted data beyond the "commit threshold", ref
     *    commit_required().
     *    In this case we free up space in the cache by committing data.
     *    We just initiate the commit while the current write request is
     *    completed right away. Note that we want to delay commits so that
     *    we can reduce the number of commit calls (commit more data per
     *    call), since commit calls effectively serialize writes: we cannot
     *    send any other write to the server while a commit is in progress.
     * 3. Cache has enough dirty data that we can flush.
     *    For sequential writes this means the new write caused a contiguous
     *    extent to grow beyond max_dirty_extent_bytes(), aka MDEB, while
     *    for non-sequential writes it means the total dirty data grew
     *    beyond MDEB.
     *    In this case we begin flush of the contiguous extent (in multiple
     *    parallel wsize-sized blocks), since there's no benefit in waiting
     *    longer: the data is already sufficient for the server scheduler to
     *    write effectively, in optimally sized blocks.
     *    We complete the application write right away without waiting for
     *    the flush to complete, as we are not under memory pressure.
     *
     * Other than these, we have the special case of "write beyond eof"
     * (termed sparse write). In the sparse write case too we perform an
     * "inline write" of all the dirty data. This is needed for correct read
     * behaviour. Imagine a reader reading from the sparse part of the file
     * which is not yet in the bytes_chunk_cache. This read will be issued
     * to the server, and since the server doesn't know the updated file
     * size (the write extending the file may still be sitting in our
     * bytes_chunk_cache) it will return eof. This is not correct: such
     * reads, issued after a successful write, are valid and must return
     * 0 bytes for the sparse range. E.g., if the file is 4KiB and the
     * application writes at offset 1MiB, a subsequent read at offset 512KiB
     * must return zeroes for the hole, not eof.
     */
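    /*
     * Condensed sketch of the decision ladder above (illustrative only,
     * the actual checks follow below):
     *
     *   if (sparse_write || do_inline_write())
     *       -> queue a *blocking* flush/commit target; the fuse reply is
     *          sent only when that target completes.
     *   else:
     *       if (commit_required())   -> kick off an async commit.
     *       if (flush_required(...)) -> kick off an async flush.
     *       -> reply to the application right away.
     */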
    /*
     * If the extent size exceeds the max allowed dirty size, as returned by
     * max_dirty_extent_bytes(), then it's time to flush the extent.
     * Note that this causes sequential writes to be flushed at just the
     * right intervals, optimizing for fewer write calls while still
     * allowing the server scheduler to merge effectively.
     * See bytes_to_flush for how random writes are flushed.
     *
     * Note: max_dirty_extent is static as it doesn't change after it's
     *       queried for the first time.
     */
static const uint64_t max_dirty_extent =
inode->get_filecache()->max_dirty_extent_bytes();
assert(max_dirty_extent > 0);
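    /*
     * For illustration (hypothetical MDEB of 64MiB): a purely sequential
     * writer accumulates a contiguous dirty extent which crosses MDEB at
     * [0, 64MiB), [64MiB, 128MiB), ..., and each such extent is flushed as
     * multiple parallel wsize-sized write calls.
     */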
/*
* Check what kind of limit we have hit.
*/
const bool need_inline_write =
(sparse_write || inode->get_filecache()->do_inline_write());
const bool need_commit =
!need_inline_write &&
inode->get_filecache()->commit_required();
const bool need_flush =
!need_inline_write &&
inode->get_filecache()->flush_required(extent_right - extent_left);
AZLogDebug("[{}] fcsm::run() (sparse={}, need_inline_write={}, "
"need_commit={}, need_flush={}, extent=[{}, {}))",
inode->get_fuse_ino(), sparse_write, need_inline_write,
need_commit, need_flush, extent_left, extent_right);
    /*
     * Nothing to do, we can complete the application write right away.
     * This should be the happy path!
     */
if (!need_inline_write && !need_commit && !need_flush) {
INC_GBL_STATS(writes_np, 1);
task->reply_write(length);
return;
}
    /*
     * Do we need to perform an "inline write"?
     * Inline write means we flush all the dirty data and wait for all the
     * corresponding backend writes to complete, before completing the
     * application write.
     */
if (need_inline_write) {
INC_GBL_STATS(inline_writes, 1);
AZLogDebug("[{}] Inline write (sparse={}), {} bytes, extent @ [{}, {})",
ino, sparse_write, (extent_right - extent_left),
extent_left, extent_right);
        /*
         * Queue a blocking flush/commit target, which will complete the
         * fuse write after the flush/commit completes.
         * In the case of stable writes we queue a flush target, while in
         * the case of unstable writes we queue a commit target. The commit
         * target implicitly performs a flush before the commit.
         */
inode->flush_lock();
if (inode->is_stable_write()) {
inode->get_fcsm()->ensure_flush(offset, length, task);
} else {
inode->get_fcsm()->ensure_commit(offset, length, task);
}
inode->flush_unlock();
// Free up the fuse thread without completing the application write.
return;
}
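    /*
     * Note: passing the fuse task to ensure_flush()/ensure_commit() above
     * is what makes the target blocking: fcsm completes the fuse write once
     * the target completes. The targets queued below pass nullptr instead,
     * so they are fire-and-forget and we reply to the application
     * ourselves.
     */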
    // Case 2: Commit
    /*
     * We don't need to do inline writes. See if we need to commit the
     * uncommitted data to the backend. We just need to start the commit;
     * we don't hold the current write task till the commit completes.
     */
if (need_commit) {
assert(!inode->is_stable_write());
inode->flush_lock();
        /*
         * Commit will only start after all ongoing flushes complete, and no
         * new flush can start (as we hold the flush_lock). This means no
         * new commit-pending data can be added, and hence the current
         * in-progress commit will finish committing all commit-pending
         * bytes.
         */
if (inode->is_commit_in_progress()) {
assert(!inode->get_filecache()->is_flushing_in_progress());
AZLogDebug("[{}] Commit already in progress, skipping commit",
ino);
} else {
AZLogDebug("[{}] Committing {} bytes", ino,
inode->get_filecache()->max_commit_bytes());
inode->get_fcsm()->ensure_commit(offset, length, nullptr);
}
inode->flush_unlock();
}
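    /*
     * Illustrative timeline: write W1 queues a commit; while it is in
     * flight, writes W2 and W3 also see commit_required() but find
     * is_commit_in_progress() true and skip queueing. That's safe: per the
     * comment above, the in-flight commit will finish committing *all*
     * commit-pending bytes, including those that became commit-pending on
     * behalf of W2/W3 before the commit started.
     */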
    /*
     * We don't need to do inline writes. See if we have enough dirty data
     * that we need to start an async flush.
     */
    // Case 3: Flush
    if ((extent_right - extent_left) < max_dirty_extent) {
        /*
         * The contiguous extent is smaller than MDEB, so this must be the
         * case of non-sequential writes accumulating enough total dirty
         * data; flush all of it, not just this extent.
         */
        extent_left = 0;
        extent_right = UINT64_MAX;
    }
/*
* Queue a non-blocking flush target for flushing *all* the dirty data.
*/
if (need_flush) {
inode->flush_lock();
inode->get_fcsm()->ensure_flush(offset, length, nullptr);
inode->flush_unlock();
}
    /*
     * Complete the write request without waiting for the backend
     * flush/commit to complete. The need_inline_write case has already
     * returned above (its blocking target completes the task), so it must
     * not reach here.
     */
assert(!need_inline_write);
task->reply_write(length);
}