in turbonfs/src/file_cache.cpp [2048:2405]
int bytes_chunk_cache::truncate(uint64_t trunc_len,
bool post,
uint64_t& bytes_truncated)
{
/*
* Must be called only when inode is being truncated.
* Then we are guaranteed that no new writes will be sent by fuse, which
* means no simultaneous calls to set_uptodate() can be updating cache_size.
*/
assert(!inode || inode->is_truncate_in_progress());
// truncate(post=true) must be called with flush_lock held.
assert(!post || inode->is_flushing);
assert(trunc_len <= AZNFSC_MAX_FILE_SIZE);
AZLogDebug("[{}] <Truncate {}> {}called [S: {}, C: {}, CS: {}], "
"U: {}, A: {}, C: {}, T: {}, chunkmap.size(): {}",
CACHE_TAG, trunc_len, post ? "POST " : "",
inode->get_server_file_size(),
inode->get_client_file_size(),
inode->get_cached_filesize(),
bytes_uptodate.load(),
bytes_allocated.load(),
bytes_cached.load(),
bytes_truncate.load(),
chunkmap.size());
/*
* Number of membufs which should have been deleted/trimmed, but were
* skipped as they were in use.
*/
int mb_skipped = 0;
bytes_truncated = 0;
/*
* Step #1: Grab chunkmap lock and mark all the affected bcs inuse, so that
* they aren't removed while we work on them in subsequent steps.
*/
std::vector<std::map<uint64_t, struct bytes_chunk>::iterator> it_vec1;
{
// TODO: Make this a shared lock.
const std::unique_lock<std::mutex> _lock(chunkmap_lock_43);
if (chunkmap.empty()) {
return 0;
}
/*
* Get chunk with starting offset >= trunc_len.
* If prev bc has one or more bytes to be truncated, count that in too.
*/
auto it = chunkmap.lower_bound(trunc_len);
if (it != chunkmap.begin()) {
auto prev_it = std::prev(it);
const struct bytes_chunk *prev_bc = &(prev_it->second);
assert(trunc_len > prev_bc->offset);
if (trunc_len < (prev_bc->offset + prev_bc->length)) {
// Prev bc has one or more bytes truncated.
it = prev_it;
}
}
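/*
* e.g., with chunks [0,8K) and [8K,16K) and trunc_len = 12K,
* lower_bound(12K) returns end(), so we step back to [8K,16K)
* since its tail [12K,16K) must be truncated.
*/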
/*
* Save iterators to all the affected bcs in it_vec1.
* Note that since we don't remove an inuse bc from the chunkmap, it's
* safe to access these iterators after the chunkmap lock is released.
*/
while (it != chunkmap.cend()) {
const struct bytes_chunk& bc = it->second;
struct membuf *mb = bc.get_membuf();
// bc must have at least one byte truncated.
assert((bc.offset + bc.length) > trunc_len);
mb->set_inuse();
it_vec1.emplace_back(it);
++it;
}
if (it_vec1.empty()) {
return 0;
}
}
/*
* Step #2: Grab membuf lock for all the affected bcs.
* Once we have all the locks, we are guaranteed that no IOs are
* ongoing on any of the truncated bcs and no new IOs can be
* started.
*/
std::vector<std::map<uint64_t, struct bytes_chunk>::iterator> it_vec3;
for (auto &it : it_vec1) {
const struct bytes_chunk& bc = it->second;
struct membuf *mb = bc.get_membuf();
assert(mb != nullptr);
/*
* We grabbed an inuse count above, and some reader(s) may also be
* holding one.
* We skip membufs with inuse > 1 as they are most likely being read,
* and hence set_locked() would unnecessarily block for long. We would
* rather skip them in this iteration and let the reader(s) continue.
* The caller will call us again and eventually we will be able to
* make progress.
*/
assert(mb->get_inuse() >= 1);
if (mb->get_inuse() > 1) {
mb_skipped++;
AZLogInfo("[{}] <Truncate {}> {}skipping inuse membuf "
"[{},{}), held by {} reader(s)",
CACHE_TAG, trunc_len, post ? "POST " : "",
bc.offset, bc.offset + bc.length,
mb->get_inuse() - 1);
mb->clear_inuse();
continue;
}
/*
* Add to it_vec3 all bcs for which we can get the membuf lock.
* These will be truncated (fully or partially).
* If post is true we use try_lock() and skip the bc if we do not get
* the lock.
*/
if (post) {
if (mb->try_lock()) {
it_vec3.emplace_back(it);
} else {
mb->clear_inuse();
mb_skipped++;
AZLogInfo("[{}] <Truncate {}> POST failed to lock membuf "
"[{},{})", CACHE_TAG,
trunc_len, bc.offset, bc.offset + bc.length);
/*
* VFS cannot issue any writes while the truncate is still in
* progress, so the lock cannot be held by a writer.
* Reads can be issued, but since truncate() would have already
* reduced the cache_size we won't issue reads beyond the
* truncated size, so readers cannot hold the membuf lock either.
* There is one small possibility: the truncate offset falls
* inside a membuf which is currently being read. It's a very
* narrow race, so we keep this useful assert.
*/
assert(0);
}
} else {
mb->set_locked();
it_vec3.emplace_back(it);
}
}
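/*
* At this point we hold an inuse count and the membuf lock on every
* bc in it_vec3, so no IO is ongoing on them and no new IO can start.
* The skipped bcs have had our inuse count dropped so their readers
* can proceed.
*/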
assert(it_vec3.size() <= it_vec1.size());
if (it_vec3.empty()) {
return mb_skipped;
}
/*
* Step #3: Release all the affected bcs. If there's a partial bc, it'll be
* trimmed from the right.
*/
{
const std::unique_lock<std::mutex> _lock(chunkmap_lock_43);
for (auto& it : it_vec3) {
struct bytes_chunk& bc = it->second;
struct membuf *mb = bc.get_membuf();
// membuf and chunkmap bc offset and length must always be in sync.
assert(bc.length == mb->length);
assert(bc.offset == mb->offset);
/*
* it_vec3 should not have any bc that lies completely before
* trunc_len.
*/
assert((bc.offset + bc.length) > trunc_len);
/*
* VFS blocks writes while there's an ongoing truncate, but it can
* send read calls which can hold an inuse count on the membuf.
* We skip such chunks and let the caller know that we couldn't
* truncate all the chunks; the caller will then sleep for some
* time, letting the readers proceed, and try again.
*
* Note that we need to check the inuse count again after acquiring
* the chunkmap lock, as some reader(s) may have grabbed an inuse
* count before we got the lock.
*/
assert(mb->get_inuse() >= 1);
if (mb->get_inuse() > 1) {
mb_skipped++;
AZLogInfo("[{}] <<Truncate {}>> {}skipping inuse membuf "
"[{},{}), held by {} reader(s)",
CACHE_TAG, trunc_len, post ? "POST " : "",
bc.offset, bc.offset + bc.length,
mb->get_inuse() - 1);
mb->clear_locked();
mb->clear_inuse();
continue;
}
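/*
* A skipped bc stays in the chunkmap with its data intact; since we
* return a non-zero mb_skipped count, the caller will retry and a
* later truncate() call will take care of it.
*/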
// trunc_len falls inside this bc?
if (bc.offset < trunc_len) {
// How many bytes to trim from the right?
const uint64_t trim_bytes = (bc.offset + bc.length - trunc_len);
assert(trim_bytes > 0);
AZLogDebug("[{}] <Truncate {}> {}trimming chunk from right "
"[{},{}) -> [{},{})", CACHE_TAG,
trunc_len, post ? "POST " : "",
bc.offset, bc.offset + bc.length,
bc.offset, trunc_len);
// Trim chunkmap bc.
bc.length -= trim_bytes;
assert((int64_t) bc.length > 0);
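/*
* The membuf must be trimmed by the same trim_bytes so that
* mb->length stays in sync with the just-reduced bc.length, as
* asserted at the top of this loop.
*/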
/*
* Trim membuf.
* trim() expects inuse to be dropped before calling.
* We have the membuf lock, so we are fine.
*/
mb->clear_inuse();
mb->trim(trim_bytes, false /* left */);
mb->set_inuse();
assert(bytes_cached >= trim_bytes);
assert(bytes_cached_g >= trim_bytes);
bytes_cached -= trim_bytes;
bytes_cached_g -= trim_bytes;
bytes_truncated += trim_bytes;
mb->clear_locked();
mb->clear_inuse();
} else {
AZLogDebug("[{}] <Truncate {}> {}truncated full chunk [{},{}), "
"mb use_count: {}",
CACHE_TAG, trunc_len, post ? "POST " : "",
bc.offset, bc.offset + bc.length,
bc.get_membuf_usecount());
/*
* Release the chunk.
* This will release the membuf (munmap() it in case of file-backed
* cache and delete it for heap backed cache).
*/
assert(num_chunks > 0);
num_chunks--;
assert(num_chunks_g > 0);
num_chunks_g--;
assert(bytes_cached >= bc.length);
assert(bytes_cached_g >= bc.length);
bytes_cached -= bc.length;
bytes_cached_g -= bc.length;
bytes_truncated += bc.length;
mb->set_truncated();
mb->clear_locked();
mb->clear_inuse();
/*
* membuf destructor will be called here unless read path
* gets a ref to this membuf. Note that writes won't be
* coming as VFS will serialize them with truncate.
* For post=false we don't have the flush_lock, so another way
* these membufs may have an extra ref count held is if
* flush_cache_and_wait() is trying to flush these membufs.
*/
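/*
* std::map::erase() invalidates only the erased iterator, so the
* remaining iterators in it_vec3 (and it_vec1) stay valid.
*/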
chunkmap.erase(it);
}
}
/*
* Recalculate cache size.
*/
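/*
* Walk the chunkmap from the highest offset downwards and set
* cache_size to the end offset of the first (i.e. highest) uptodate
* membuf found. If no uptodate membuf remains, cache_size is reset
* to 0 below.
*/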
bool cache_size_updated = false;
for (auto it = chunkmap.rbegin(); it != chunkmap.rend(); ++it) {
struct bytes_chunk *bc = &(it->second);
struct membuf *mb = bc->get_membuf();
if (mb->is_uptodate()) {
assert(mb->length > 0);
assert(cache_size >= (mb->offset + mb->length));
/*
* cache_size is only reduced by truncate() and truncates are
* serialized by the VFS inode lock, so only one truncate can be
* ongoing and hence cache_size cannot be reduced behind our back.
* Also, since no new writes will be sent by fuse, no calls to
* set_uptodate() can be ongoing and hence cache_size won't be
* increased either.
*/
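/*
* The compare_exchange is used as a checked store: 'expected' holds
* the value we just read, so if anything had concurrently changed
* cache_size the CAS would fail and the assert would catch it.
*/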
uint64_t expected = cache_size;
[[maybe_unused]]
const bool updated =
cache_size.compare_exchange_strong(expected, mb->offset + mb->length);
assert(updated);
assert(cache_size == (mb->offset + mb->length));
assert(cache_size > 0);
cache_size_updated = true;
break;
}
}
if (!cache_size_updated) {
uint64_t expected = cache_size;
[[maybe_unused]]
const bool updated =
cache_size.compare_exchange_strong(expected, 0);
assert(updated);
assert(cache_size == 0);
}
}
assert(bytes_truncated <= (AZNFSC_MAX_FILE_SIZE - trunc_len));
bytes_truncate += bytes_truncated;
bytes_truncate_g += bytes_truncated;
num_truncate++;
num_truncate_g++;
AZLogDebug("[{}] <Truncate {}> {}done, [S: {}, C: {}, CS: {}], "
"cache_size: {}, U: {}, A: {}, C: {}, T: {}, "
"chunkmap.size(): {}, bytes_truncated: {}, mb_skipped: {}",
CACHE_TAG,
trunc_len, post ? "POST " : "",
inode->get_server_file_size(),
inode->get_client_file_size(),
inode->get_cached_filesize(),
cache_size.load(),
bytes_uptodate.load(),
bytes_allocated.load(),
bytes_cached.load(),
bytes_truncate.load(),
chunkmap.size(),
bytes_truncated,
mb_skipped);
/*
* See comment in get_cache_size().
* Before reaching here we would have already hit this assert in
* get_cached_filesize()->get_cache_size(), but we keep it here for
* correctness.
*/
assert((cache_size >= bytes_uptodate) ||
(bytes_uptodate > bytes_cached));
return mb_skipped;
}