turbonfs/inc/file_cache.h

#ifndef __AZNFSC_FILE_CACHE_H__
#define __AZNFSC_FILE_CACHE_H__

#include <map>
#include <mutex>
#include <memory>
#include <vector>
#include <list>
#include <atomic>
#include <chrono>
#include <cstring>
#include <cstdint>
#include <cassert>
#include <unistd.h>

#include "aznfsc.h"
#include "rpc_stats.h"
#include "fcsm.h"

struct nfs_inode;
struct rpc_task;

/*
 * Reminder to audit use of asserts to ensure we don't depend on assert
 * for error handling.
 *
 * XXX : This is temporarily disabled to support Release builds, but we need
 * to audit this before releasing.
 */
#if 0
#ifdef NDEBUG
#error "Need to audit use of asserts in file_cache"
#endif
#endif

/*
 * Uncomment this if you want to use the tailroom from last chunk for
 * new get() requests asking for data right after the last chunk.
 * f.e., let's say application made the following sequence of get() requests:
 * 1. get(0, 131072) - due to 128K read request.
 * 2. Now it reads from the backend and backend returned eof after say
 *    10 bytes. Caller will call release(10, 131062) to trim the last
 *    chunk for correctly representing the file size. This will not free
 *    the membuf but just reduce the bytes_chunk's length.
 * 3. Now caller makes the call get(10, 131062).
 *
 * With UTILIZE_TAILROOM_FROM_LAST_MEMBUF defined, the bytes_chunk that
 * we return will still point to the existing membuf. Without the define
 * this new bytes_chunk will get its own membuf of 131062 bytes.
 *
 * Though it saves space, it complicates things wrt setting of the uptodate
 * MB_Flag. Since the uptodate flag is a property of the membuf, the 2nd
 * caller who gets [10, 131072) should not treat it as uptodate as it's
 * not (only first 10 bytes are uptodate).
 * Since this will not happen much in practice, for now keep it disabled
 * so that we don't have to worry about this complexity.
 *
 * IMPORTANT: With this, is_whole will be set to false for all bytes_chunks
 *            returned for this range. This makes the cache less effective
 *            in most situations and unusable in some situations.
 *            If we decide to enable this we need to also make changes to
 *            merge the two bytes_chunks into a single bytes_chunk covering
 *            the entire membuf as soon as all existing users drop their
 *            inuse count and lock.
 */
//#define UTILIZE_TAILROOM_FROM_LAST_MEMBUF

namespace aznfsc {

/*
 * This is the maximum chunk size we allow. This is like our page size, but
 * unlike the usual page cache where every page is fixed size, our chunk cache
 * may have chunks of different sizes, though for the perfect case where
 * applications are doing sequential reads/writes all/most chunks would have
 * the max size. In fact, we want large chunks to reduce maintenance overhead.
 * Currently the fuse kernel driver never sends any read/write IO larger than
 * 1MB, so that will end up being the chunk size, but for background IOs that
 * we initiate (readahead IOs) we will use the max chunk size.
 * A chunk need not map 1:1 to an NFS READ/WRITE RPC, though typically we will
 * issue one NFS RPC for one chunk but we can have 1:m or m:1 mappings where
 * multiple chunks are populated by one NFS RPC and vice versa.
 *
 * See comment above membuf::flag.
 */
#define AZNFSC_MAX_CHUNK_SIZE (4ULL * 1024 * 1024)

#define AZNFSC_BAD_OFFSET (~0ull)
#define PAGE_SIZE (4096ULL)

/*
 * Cache tag used for logging.
 * When inode is present, use inode number else address of bytes_chunk_cache
 * expressed as a 64-bit integer.
 */
#define CACHE_TAG (inode ? inode->get_fuse_ino() : (uint64_t) this)

// Forward declaration.
class bytes_chunk_cache;

/*
 * Note on membuf flags Dirty/Flushing/CommitPending
 * =================================================
 * When application data is copied to a membuf it becomes Dirty.
 * Dirty membufs are flushed to the server either when dirty data exceeds some
 * threshold or when the file is closed. While membuf is being flushed Flushing
 * bit is set. Once the membuf is successfully flushed, Dirty and Flushing bits
 * are cleared. If the flush was done using Unstable Write then CommitPending
 * bit is set. Once the membuf is successfully committed to the server,
 * CommitPending bit is cleared.
 * As long as any of these bits is set, the membuf contains data which may not
 * yet be saved on the server and hence releasing the membuf may cause data
 * loss.
 */
namespace MB_Flag {
    enum : uint32_t
    {
        Uptodate      = (1 << 0),  // Fit for reading.
        Locked        = (1 << 1),  // Exclusive access for updating membuf
                                   // data.
        Truncated     = (1 << 2),  // Entire byte range covered by membuf is
                                   // truncated (shrink) from the file. Note
                                   // that even Dirty and/or inuse membufs
                                   // can be truncated, as we don't care
                                   // for cached data belonging to the
                                   // truncated region of the file.
        Dirty         = (1 << 3),  // Data in membuf is newer than the Blob.
        Flushing      = (1 << 4),  // Data from dirty membuf is being synced
                                   // to Blob (as UNSTABLE or FILE_SYNC Write).
        CommitPending = (1 << 5),  // Data from dirty membuf was synced using
                                   // UNSTABLE Write but it has not yet been
                                   // committed to the server.
    };
}

/**
 * Memory buffer, used for caching chunks in memory.
 * For file-backed bytes_chunk_cache membufs are realized by mmap()ed memory,
 * while for non file-backed bytes_chunk_cache membufs are realized by memory
 * allocated on the heap.
 */
struct membuf
{
    /**
     * Four things define a membuf:
     * 1. What offset inside the file it's caching.
     * 2. Count of bytes it's caching.
     * 3. Memory buffer address where the above data is stored.
     * 4. Backing file fd (in case of file-backed caches).
     *
     * We also have the bytes_chunk_cache backlink. This is strictly for
     * updating various cache metrics as membuf flags are updated.
     */
    membuf(bytes_chunk_cache *_bcc,
           uint64_t _offset,
           uint64_t _length,
           int _backing_file_fd = -1);

    /**
     * membuf destructor, frees the memory used by membuf.
     * This frees heap allocated memory for non file-backed membufs, while
     * for file-backed membufs the mmap()ed memory is munmap()ed.
     * Since membuf is used as a shared_ptr, this will be called when the
     * last ref to the shared_ptr is dropped, typically when the bytes_chunk
     * referring to the membuf is discarded.
     */
    ~membuf();

    /*
     * bytes_chunk_cache to which this membuf belongs.
     * This is strictly for updating various cache metrics as membuf flags are
     * updated.
     * Those are atomic, hence it's ok to access w/o serializing access to
     * the cache.
     */
    bytes_chunk_cache *bcc = nullptr;

    /*
     * This membuf caches file data in the range [offset, offset+length).
     * A membuf starts caching [initial_offset, initial_offset+initial_length),
     * but can be later trimmed to cache a smaller section of the file,
     * [offset, offset+length) as some release() call releases part of the
     * chunk. For trimming from right, only length needs to be reduced, while
     * for trimming from left, length is reduced and offset is increased.
     * The corresponding chunkmap[]'s offset, length and buffer_offset are also
     * updated accordingly and they should always be same as that of the membuf.
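     *
     * For example (illustrative numbers only, not taken from any particular
     * workload): a membuf created to cache [0, 131072) that later has its
     * first 4096 bytes release()d ends up with offset == 4096 and
     * length == 126976, while initial_offset/initial_length below stay
     * 0/131072 for the life of the membuf.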
* * initial_offset and initial_length hold the initial values that membuf * was created with, these remain unchanged for the life of the membuf. */ const uint64_t initial_offset; const uint64_t initial_length; std::atomic<uint64_t> offset; std::atomic<uint64_t> length; /* * Actual allocated length. This can be greater than length for * file-backed membufs. See comments above allocated_buffer. * Once set this will not change, even when the membuf is drop'ed and * allocated_buffer becomes nullptr. */ uint64_t allocated_length = 0; // Backing file fd (-1 for non file-backed caches). const int backing_file_fd = -1; /* * Data buffer used for cacheing. This will be mmap()ed memory for * file-backed caches and heap buffer for non file-backed ones. For non * file-backed caches, these will never be nullptr, but for file-backed * caches, nullptr means that the cache is dropped and we need to load the * data from the backing file. * * Since mmap() can only be done for page aligned file offsets, we need * allocated_buffer to track the page aligned mmap()ed address, while * buffer is the actual buffer address to use for storing cached data. * For non file-backed membufs, both will be same. * This also means that for file-backed caches the actual allocated bytes * is "length + (buffer - allocated_buffer)". See allocated_length. * * Once set buffer and allocated_buffer should not change. For file-backed * caches drop() will munmap() and set allocated_buffer to nullptr. * * TODO: trim() doesn't update buffer, which means membuf::buffer does not * point to the correct data cached by this membuf. Always use * bytes_chunk::get_buffer() to get the correct address. */ uint8_t *buffer = nullptr; uint8_t *allocated_buffer = nullptr; /* * If is_file_backed() is true then 'allocated_buffer' is the mmap()ed * address o/w it's the heap allocation address. */ bool is_file_backed() const { return (backing_file_fd != -1); } // Returns buffer address for storing the data. uint8_t *get() const { return buffer; } /** * Drop data cached in memory. * This is a no-op for non file-backed membufs since for them memory * is the only place where data is stored. For file-backed membufs this * drops data from the memory while the data is still present in the file. * load() can be used to reload data in memory cache. * * Returns the number of bytes reclaimed by dropping the cache. A -ve * return indicates error in munmap(). */ int64_t drop(); /** * Load data from file backend into memory. * This is a no-op for non file-backed membufs since for them the data * is always in memory. * load() assumes that the backing file is present and has a size at least * equal to offset+length, so that it can map valid data. The backing file * obviously will need to be invalidated if the file's data has changed * (conveyed by mtime/size change) and anytime the backing file is * invalidated all membufs referring to data inside the file MUST be * destroyed first. */ bool load(); uint32_t get_flag() const { return flag; } /** * Is membuf uptodate? * Only uptodate membufs are fit for reading. * A newly created membuf is not uptodate and must be set uptodate * after reading the required data from the Blob. */ bool is_uptodate() const { return (flag & MB_Flag::Uptodate); } void set_uptodate(); void clear_uptodate(); /* * wait-for-uptodate is a two step operation, the pre_unlock must be called * with the membuf locked while the post_unlock must be called after * releasing the membuf lock. post_unlock is the one that does the actual * waiting, if needed. 
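     *
     * A minimal sketch of the call order described above (illustrative only;
     * 'mb' is assumed to be a membuf the caller already holds locked):
     *
     *   // membuf lock held at this point.
     *   mb->wait_uptodate_pre_unlock();
     *   mb->clear_locked();
     *   // Actual waiting, if needed, happens only after the lock is dropped.
     *   mb->wait_uptodate_post_unlock();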
*/ void wait_uptodate_pre_unlock(); void wait_uptodate_post_unlock(); bool is_locked() const { const bool locked = (flag & MB_Flag::Locked); /* * XXX Following assert is usually true but for the rare case when * caller may drop the inuse count while holding the lock for * release()ing the chunk, this will not hold. * See read_callback() for such usage. */ #if 0 // If locked, must be inuse. assert(is_inuse() || !locked); #endif return locked; } /** * set_locked() returns true if it got the lock w/o having to wait. */ bool set_locked(); void clear_locked(); bool try_lock(); /** * A membuf is marked dirty when the membuf data is updated, making it * out of sync with the Blob contents for the range. This should be done * by writer threads which write application data into membuf. A dirty * membuf must be written to the Blob before it can be freed. Once written, * it should be marked not-dirty by calling clear_dirty(). */ bool is_dirty() const { const bool dirty = (flag & MB_Flag::Dirty); /* * Make sure is_dirty returns true only when is_uptodate() is true * otherwise we may write garbage data to Blob. * If membuf is dirty, it must not be in commit pending state. * A membuf goes to commit pending state only after it's successfully * flushed to the server (using UNSTABLE writes), and then it's no * longer dirty, and new application data cannot be copied over it * till it's committed. */ assert(!dirty || is_uptodate()); assert(!dirty || !(flag & MB_Flag::CommitPending)); return dirty; } void set_dirty(); void clear_dirty(); /** * Is this membuf truncated as a result of file truncate/shrink? * Truncated membufs are immediately removed from the cache, so this * is mainly for asserting in ~membuf(). */ bool is_truncated() const { return (flag & MB_Flag::Truncated); } void set_truncated(); bool is_flushing() const { const bool flushing = (flag & MB_Flag::Flushing); /* * If membuf is in flushing state, it means it can't be in * commit pending state. Only after successfully flushing * (using UNSTABLE writes) it goes to commit pending state. */ assert(!flushing || !(flag & MB_Flag::CommitPending)); /* * If flushing, must be dirty, but we cannot safely assert here * as those flags may be cleared while we are accessing them. */ #if 0 assert(!flushing || (flag & MB_Flag::Dirty)); #endif return flushing; } void set_flushing(); void clear_flushing(); bool is_commit_pending() const { const bool commit_pending = (flag & MB_Flag::CommitPending); /* * membuf can be marked CommitPending only after it's successfully * written to the server as unstable write. Till it's committed we * cannot copy new data to the membuf else we risk overwriting data * which is not yet committed to the server. */ assert(!commit_pending || !(flag & MB_Flag::Dirty)); assert(!commit_pending || !(flag & MB_Flag::Flushing)); return commit_pending; } void set_commit_pending(); void clear_commit_pending(); bool is_inuse() const { return (inuse > 0); } int get_inuse() const { return inuse; } void set_inuse(); void clear_inuse(); /** * trim 'trim_len' bytes from the membuf. 'left' should be true if trimming * is done from the left side of the membuf, else trimming is done from the * right side of the membuf. * It'll perform necessary validations and updations needed when a membuf is * trimmed. * Caller must make sure that this membuf is not being accessed by some other * thread. and trim() can safely update it. This means, trim() can be called * in one of the following cases: * 1. membuf is locked. * See bytes_chunk_cache::truncate(). * 2. 
chunkmap lock is held and membuf is not inuse. * See bytes_chunk_cache::scan()->safe_to_release(). * * Called from release() and truncate(), release() can trim both from left * and right while truncate() can only trim from right. */ void trim(uint64_t trim_len, bool left); private: /* * Lock to correctly read and update the membuf state. * * Note on safely accessing membuf * =============================== * Since multiple threads may be trying to read and write to the same file * or part of the file, we need to define some rules for ensuring consistent * access. Here are the rules: * * 1. Any reader or writer gets access to membuf by a call to * bytes_chunk_cache::get(). membufs are managed by shared_ptr, hence * the reader/writer is guaranteed that as long as it does not destroy * the returned bytes_chunk, the underlying membuf will not be freed. * Note that consistent read/write access though needs synchronization * among the various reader/writer threads, read on. * 2. A thread trying to write to the membuf must get exclusive access to * the membuf. It can get that by calling set_locked(). set_locked() * will block if the lock is already held by some other thread and will * return after acquiring the lock. Blocking threads will wait on the * condition_variable 'cv' and will be woken up when the current locking * thread unlocks. Note that membuf will be written under the following * cases: * i) A writer writes user passed data to the membuf. * ii) A reader reads data from the Blob into the membuf. * 3. Membufs also have an inuse count which indicates if there could be * an ongoing IO (whether there is actually an ongoing IO can be * found by using the locked bit). The purpose of inuse count is to * just mark the membuf such that clear() doesn't clear membufs which * might soon afterwards have IOs issued. * bytes_chunk_cache::get() will bump the inuse count of all membufs * it returns since the caller most likely might perform IO on the * membuf. It's caller's responsibility to clear the inuse by calling * clear_inuse() once they are done performing the IO. This should be * done after performing the IO, and releasing the lock taken for the * IO. * 4. A newly created membuf does not have valid data and hence a reader * should not read from it. Such an membuf is "not uptodate" and a * reader must first read the corresponding file data into the membuf, * and mark the membuf "uptodate" after successfully reading the data * into it. It can do that after getting exclusive access to membuf * by calling set_locked(). Any other reader which accesses the membuf * in the meantime will find it "not update" and it'll try to update * the membuf itself but it'll find the membuf locked, so set_locked() * will cause the thread to wait on 'cv'. Once the current reader * updates the membuf, it marks it "uptodate" by calling set_uptodate() * and then unlock it by calling clear_locked(). Other readers waiting * for the lock will get woken up and they will discover that the * membuf is uptodate by checking is_uptodate() and they can then read * that data into their application data buffers. * * IMPORTANT RULES FOR UPDATING THE "UPTODATE" BIT * =============================================== * - Any reader that gets a bytes_chunk whose membuf is not uptodate, * must try to read data from the Blob, but only mark it uptodate if * is_whole was also true for the bytes_chunk. 
This is because * is_whole will be true for bytes_chunk representing "full membuf" * (see UTILIZE_TAILROOM_FROM_LAST_MEMBUF) and hence they only can * correctly mark the membuf as uptodate. Other readers, if they get * the lock first, they can issue the Blob read to update the part * of membuf referred by their bytes_chunk, and return that data to * fuse, but they cannot mark the membuf as uptodate, so future users * cannot benefit from their read. * So, ONLY IF maps_full_membuf() returns true for a bytes_chunk, the * reader MUST mark the membuf uptodate. * * - Writers must set the uptodate bit only if they write the entire * membuf (maps_full_membuf() returns true), else they should not * change the uptodate bit. * * 5. If a reader finds that a membuf is uptodate (as per is_uptodate()), it * can return the membuf data to the application. Note that some writer * may be writing to the data simultaneously and reader may get a mix * of old and new data. This is fine as per POSIX. Users who care about * this must synchronize access to the file. * 6. Once an membuf is marked uptodate it remains uptodate for the life * of the membuf, unless one of the following happens: * i) We detect via file mtime change that our cached copy is no longer * valid. In this case the entire cache for that file is clear()ed * which causes all bytes_chunk and hence all membufs to be freed. * ii) An NFS read from the given portion of Blob fails. We will need to * understand the effects of this better, since we normally never * fail an NFS IO (think hard mount). * 7. A writer MUST observe the following rules: * i) If writer is writing to a part of the membuf, it MUST ensure * that membuf is uptodate before it can modify part of the membuf. * It must do that by reading the *entire* membuf from the Blob, * and marking the membuf as uptodate. Then it must update the part * it wants to. After writing it must mark the membuf as dirty. * All of this must be done while holding the lock on the membuf as * we want the membuf update to be atomic. * ii) If writer is writing to the entire membuf (maps_full_membuf() * returns true), it can directly write to the membuf even if it's * not already uptodate, and after writing it must mark the membuf * as uptodate and dirty. * 7. A writer must mark the membuf dirty by calling set_dirty(), after it * updates the membuf data. Dirty membufs must be synced with the Blob * at some later time and once those writes to Blob succeed, the membuf * dirty flag must be cleared by calling clear_dirty(). Note that a * dirty membuf is still uptodate since it has the latest content for * the reader. */ std::mutex mb_lock_44; /* * Flag bitmap for correctly defining the state of this membuf. * This is a bitwise or of zero or more MB_Flag values. * * Note: membuf::flag represents the state of the entire membuf, * irrespective of the offset within the membuf a particular * bytes_chunk represents. This means even if one thread has to * read say 1 byte but the actual bytes_chunk created by another * thread is of size 1GB, the former thread has to wait till the * entire 1GB data is read by the other thread and the membuf is * marked MB_Flag::Uptodate. * This means very large membufs will cause unnecessary waits. * Test out and find a good value for AZNFSC_MAX_CHUNK_SIZE. */ std::atomic<uint32_t> flag = 0; // For managing threads waiting on MB_Flag::Locked. std::condition_variable cv; /* * Incremented by bytes_chunk_cache::get() before returning a membuf to * the caller. 
Caller must decrement it once they are done reading or * writing the membuf. */ std::atomic<uint32_t> inuse = 0; }; /** * This represents one contiguous chunk of bytes in bytes_chunk_cache. * bytes_chunk_cache consists of zero or more bytes_chunk ordered by offset. * Note that a byte range can be cached using one or more bytes_chunk and the * size of the individual component bytes_chunk depends on the order in which * the application writes data to the file. * A contiguous file range cached by a series of bytes_chunk is called an * "extent". Extents are important as they decide if/when we can issue full * block-sized write to the Blob. */ struct bytes_chunk { // bytes_chunk_cache needs to access the private member alloc_buffer. friend bytes_chunk_cache; private: // bytes_chunk_cache to which this chunk belongs. bytes_chunk_cache *bcc = nullptr; /* * This is the underlying membuf. The actual buffer where data is stored * can be found by adding buffer_offset to this, and can be retrieved using * the convenience function get_buffer(). buffer_offset is typically 0 but * it can be non-zero when multiple chunks are referring to the same buffer * but at different offsets (e.g., cache trimming). * Any chunk that refers to the same allocated buffer will hold a ref to * alloc_buffer, so alloc_buffer will be freed when the last ref is dropped. * This should typically happen when the chunk is freed. * * To find the length of the allocated buffer, use alloc_buffer->length. */ std::shared_ptr<membuf> alloc_buffer; public: /* * Offset from the start of file this chunk represents. * For bytes_chunks stored in chunkmap[] this will be incremented to trim * a bytes_chunk from the left and must always match membuf::offset while * for non-chunkmap bcs (those returned by get()/getx()) this will be the * copy of membuf::offset at the time the bc was created, membuf can be * later trimmed and its offset may increase. */ uint64_t offset = 0; /* * Length of this chunk. * User can safely access [get_buffer(), get_buffer()+length). * For bytes_chunks stored in chunkmap[] this will be reduced to trim * a bytes_chunk from left and right and must always match membuf::length * while for non-chunkmap bcs (those returned by get()/getx()) this will be * the copy of membuf::length at the time the bc was created, membuf can be * later trimmed and its length may reduce. */ uint64_t length = 0; /* * Offset of buffer from alloc_buffer->get(). * For bytes_chunks stored in chunkmap[] this will be incremented to trim * a bytes_chunk from the left. */ uint64_t buffer_offset = 0; /* * Private data. User can use this to store anything they want. but * most commonly it's used to update the progress as the bc can be read * or written in parts. Hence "bc.offset + bc.pvt" is the next offset * to read/write and "bc.length - bc.pvt" is the remaining length to * read/write and "bc.get_buffer() + bc.pvt" is the address where the * data must be read/written. * * This is opaque to the cache and cache doesn't use it. Hence for bcs * stored in the chunkmap, pvt will be 0. * * TODO: Shall we designate pvt for this specific job and rename this to * something more specific like cursor. */ uint64_t pvt = 0; /* * Number of backend calls issued to sync this byte chunk with the backing * blob. It could be read call(s) to read data from the blob or it could be * write call(s) to sync dirty byte chunk. * * Note: Values greater than 1 signify partial read/write calls. 
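     *
     * Illustrative use of pvt and this counter in a partial-IO loop
     * (sketch only; issue_backend_write() is a hypothetical helper, not
     * part of this header, and error handling is omitted):
     *
     *   while (bc.pvt < bc.length) {
     *       // Write the remaining [bc.pvt, bc.length) portion of the chunk.
     *       const uint64_t done =
     *           issue_backend_write(bc.offset + bc.pvt,
     *                               bc.get_buffer() + bc.pvt,
     *                               bc.length - bc.pvt);
     *       bc.pvt += done;
     *       bc.num_backend_calls_issued++;
     *   }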
*/ int num_backend_calls_issued = 0; /* * bytes_chunk is a window/view into the underlying membuf which holds the * actual data. It can refer to the entire membuf or any contiguous part * of it. This tells if a bytes_chunk refers to the full membuf or a part. * This is useful for the caller to know as based on this they can decide * how the membuf flags need to be updated when peforming IOs through this * bytes_chunk. f.e., if a reader has a bytes_chunk refering to a partial * membuf and they perform a successful read, they cannot mark the membuf * uptodate as they have not read data for the entire membuf. * OTOH if a write has a bytes_chunk refering to a partial membuf and they * write data into the bytes_chunk they MUST mark the membuf dirty as * updating even a single byte makes the membuf dirty. At the same time if * the membuf is not uptodate, the writer cannot simply copy into the * bytes_chunk, as only uptodate membufs can be partially updated. * * Note: Note that trimming doesn't make changes to the membuf but instead * it changes the corresponding bytes_chunk in the chunkmap[], so * is_whole indicates whether a bytes_chunk returned by get() * refers to the complete bytes_chunk in the chunkmap[] or part of it. * * Note: This is set at the time when the bytes_chunk is created (and * returned by get()/getx()/scan()), depending on whether the * returned bytes_chunk fully covers the corresponding chunkmap * bytes_chunk or it refers to only partial chunkmap bytes_chunk. * If chunkmap bytes_chunk changes afterwards, this may not be * correct. Note that an inuse chunkmap bytes_chunk can (only) be * updated by a truncate() call. */ bool is_whole = false; /* * is_new indicates whether a bytes_chunk returned by a call to * bytes_chunk_cache::get() refers to a freshly allocated membuf or it * refers to an existing membuf. is_new implies that membuf doesn't contain * valid data and hence the uptodate membuf flag must be false. It only * makes sense for bytes_chunk returned by bytes_chunk_cache::get(), and not * for bytes_chunk which are stored in bytes_chunk_cache::chunkmap. * Since membufs are allocated to fit caller's size req, bytes_chunk with * is_new set MUST have is_whole also set. */ bool is_new = false; /** * Get the inode corresponding to this bc. */ struct nfs_inode *get_inode() const; /** * Return membuf corresponding to this bytes_chunk. * This will be used by caller to synchronize operations on the membuf. * See membuf::flag and various operations that can be done on them. */ struct membuf *get_membuf() const { struct membuf *mb = alloc_buffer.get(); // membuf must have valid alloc_buffer at all times. assert(mb != nullptr); return mb; } /** * Returns usecount for the underlying membuf. * A bytes_chunk added only to bytes_chunk_cache::chunkmap has a usecount * of 1 and every user that calls get() will get one usecount on the * respective membuf. */ int get_membuf_usecount() const { return alloc_buffer.use_count(); } /** * Start of valid cached data corresponding to this chunk. * This will typically have the value alloc_buffer->get(), i.e., it points * to the start of the data buffer represented by the shared pointer * alloc_buffer, but if some cached data is deleted from the beginning of a * chunk, causing the buffer to be "trimmed" from the beginning, this can * point anywhere inside the buffer. */ uint8_t *get_buffer() const { // Should not call on a dropped cache. assert(alloc_buffer->get() != nullptr); /* * buffer_offset should point to a valid membuf byte. 
* Note that this bytes_chunk is a snapshot of the chunkmap's * bytes_chunk at some point in the past (and it not necessarily the * chunkmap bc, which should always be in sync with membuf), so it's * possible that after this bc was returned by a get()/getx() call, * user may have release()d the chunk which would cause membuf to be * trimmed, so we cannot safely compare with the current membuf length * and offset fields, but one thing we can say for sure is that * buffer_offset should never exceed the membuf's initial_length. */ assert(buffer_offset < alloc_buffer->initial_length); return alloc_buffer->get() + buffer_offset; } /** * Does this bytes_chunk cover the "full membuf"? * Note that "full membuf" refers to membuf after trimming if any. */ bool maps_full_membuf() const { assert(!is_new || is_whole); return is_whole; } /** * Is it safe to release (remove from chunkmap) this bytes_chunk? * bytes_chunk whose underlying membuf is either inuse or dirty or commit * pending are not safe to release because: * - dirty indicates membuf has some data which needs to be flushed, so * we cannot release it w/o flushing the data. * - inuse indicates some other thread is doing something with the membuf, * maybe it's writing fresh data to it and may mark it dirty. If we * allow such membuf to be released, future readers who get() the cache * will miss those changes. * - commit pending means those membufs are sitting in the TBL of the Blob, * not yet committed, server may fail the commit in which case we might * have to resend those to the server and hence we cannot free those * till successful commit. * * Note: Since we do not allow inuse membufs to be released, it means * if caller is owning the membuf they must drop their inuse count * before calling release(). */ bool safe_to_release() const { const struct membuf *mb = get_membuf(); return !mb->is_inuse() && !mb->is_dirty() && !mb->is_commit_pending(); } /** * Does this bytes_chunk need to be flushed? * bytes_chunk whose underlying membuf is dirty and not already being * flushed, qualify for flushing. */ bool needs_flush() const { const struct membuf *mb = get_membuf(); return mb->is_dirty() && !mb->is_flushing(); } /** * Constructor to create a brand new chunk with newly allocated buffer. * This chunk is the sole owner of alloc_buffer and 'buffer_offset' is 0. * Later as this chunk is split or returned to the caller through get(), * alloc_buffer may have more owners. When the last owner releases claim * alloc_buffer will be freed. This should happen when the chunk is freed. * * XXX: If we need to gracefully handle allocation failure, the buffer * allocation must be done by the caller. * * XXX Default std new[] implementation is very slow, use tcmalloc for * much faster perf. The main problem with std new is that it doesn't * use memory pools and for large allocations it gets/releases memory * to the system, which causes zero'ing overhead as kernel has to * zero pages. */ bytes_chunk(bytes_chunk_cache *_bcc, uint64_t _offset, uint64_t _length); /** * Constructor to create a chunk that refers to alloc_buffer from another * existing chunk. The additional _buffer_offset allows flexibility to * each chunk to point anywhere inside alloc_buffer. * This is useful for chunks created due to splitting or when returning * bytes_chunk from bytes_chunk_cache::get(). 
*/ bytes_chunk(bytes_chunk_cache *_bcc, uint64_t _offset, uint64_t _length, uint64_t _buffer_offset, const std::shared_ptr<membuf>& _alloc_buffer, bool _is_whole = true, bool _is_new = false); /** * Copy constructor, only for use by test code. */ bytes_chunk(const bytes_chunk& rhs) : bytes_chunk(rhs.bcc, rhs.offset, rhs.length, rhs.buffer_offset, rhs.alloc_buffer, rhs.is_whole, rhs.is_new) { // new bytes_chunk MUST cover whole membuf. assert(!is_new || is_whole); pvt = rhs.pvt; num_backend_calls_issued = rhs.num_backend_calls_issued; } /** * Default constructor and assignment operator, only for use by test code. */ bytes_chunk() = default; bytes_chunk& operator=(const bytes_chunk&) = default; #ifdef UTILIZE_TAILROOM_FROM_LAST_MEMBUF /** * Return available space at the end of buffer. * This is usually helpful when a prev read() was short and could not fill * the entire buffer and then a subsequent read() is issued to fill * subsequent data. */ uint64_t tailroom() const { const int64_t tailroom = (alloc_buffer->length - (buffer_offset + length)); assert(tailroom >= 0); assert(tailroom <= (int64_t) AZNFSC_MAX_CHUNK_SIZE); return tailroom; } #endif /** * Drop data cached in memory, for this bytes_chunk. * * Returns the number of bytes reclaimed by dropping the cache. A -ve * return indicates error in munmap(). */ int64_t drop() { assert(get_membuf_usecount() > 0); /* * If the membuf is being used by someone else, we cannot drop/munmap * it, o/w users accessing the data will start getting errors. */ if (get_membuf_usecount() == 1) { return alloc_buffer->drop(); } return 0; } /** * Load data from file backend into memory, for this bytes_chunk. */ void load() { [[maybe_unused]] const bool ret = alloc_buffer->load(); assert(ret); } }; /** * bytes_chunk_cache::scan() can behave differently depending on the scan_action * passed. */ enum class scan_action { SCAN_ACTION_INVALID = 0, SCAN_ACTION_GET, SCAN_ACTION_RELEASE, }; /** * This is the per-file cache that caches variable sized extents and is * indexed using byte offset and length. * * Note on read/write performance using bytes_chunk_cache * ====================================================== * If you use file-backed bytes_chunk_cache then the performance of that * will be limited by the backing file read/write performance as the data * read from the NFS server is placed into the read buffers which are actually * the mmap()ed buffers, hence the steady state write performance will be * limited by the file write throughput. Having said that, if you have large * amount of RAM and the file being read can fit completely in RAM, then the * read will happen very fast and then the data can be flushed to the backing * file later. * OTOH, if you use non file-backed cache, and make sure you release the * chunks as they are read from the server, then the read performance is only * limited by the memory write speed. * Similar log applies to write. */ class bytes_chunk_cache { friend membuf; friend bytes_chunk; public: bytes_chunk_cache(struct nfs_inode *_inode, const char *_backing_file_name = nullptr); ~bytes_chunk_cache(); /** * Call this to check if the cache is empty, i.e., newly allocated. */ bool is_empty() const { /* * TSAN Warning. * FIXME: * If we call it while bytes_chunk_cache::scan() is adding to chunkmap, * TSAN complains of data race. * We need to fix this, though usually the caller is not strictly * depending on the result returned by this, as the cache can change * right after the call. 
*/ return chunkmap.empty(); } /** * Return a vector of bytes_chunk that cache the byte range * [offset, offset+length). Parts of the range that correspond to chunks * already present in the cache will refer to those existing chunks, for * such chunks is_new will be set to false, while those parts of the * range for which there wasn't an already cached chunk found, new chunks * will be allocated and inserted into the chunkmap. These new chunks will * have is_new set to true. This means after this function successfully * returns there will be chunks present in the cache for the entire range * [offset, offset+length). * * This can be called by both, * - writers, who want to write to the specified range in the file. * The returned chunks is a scatter list where the caller should write. * bytes_chunk::buffer is the buffer corresponding to each chunk where * caller should write bytes_chunk::length amount of data. * - readers, who want to read the specified range from the file. * The returned chunks is a scatter list containing the data from the * file. * * TODO: Reuse buffer from prev/adjacent chunk if it has space. Currently * we will allocate a new buffer, this works but is wasteful. * e.g., * get(0, 4096) * release(10, 4086) <-- this will just update length but the buffer * will remain. * get(10, 4086) <-- this get() should reuse the existing buffer. * * Update: This is now done, but we still haven't generalized the * solution to reuse buffer for all cases, but the most * common case is now addressed! Leaving the TODO for * tracking the generalized case. * Update2: This introduces challenges, so it's turned off for now. * see UTILIZE_TAILROOM_FROM_LAST_MEMBUF. * * Note: Caller must do the following for correctly using the returned * bytes_chunks: * * 1. Since get() increments the inuse count for each membuf it * returns, caller must call clear_inuse() once it's done * performing IO on the membuf. For writers it'll be after they * are done copying application data to the membuf and marking * it dirty, and for readers it'll be after they are done reading * data from the Blob into the membuf (if it's not already * uptodate). Once the caller drops inuse count * bytes_chunk_cache::clear() can potentially remove the membuf * from the cache, so the caller must make sure that it drops * inuse count only after correctly setting the state, * i.e., call set_dirty() after writing to the membuf. * 2. IOs can be performed to the membuf only after locking it using * set_locked(). Once the IO completes release the lock using * clear_locked(). This must be done before calling clear_inuse(). * So the logical seq of operations are: * >> get() * >> for each bytes_chunk returned * >> set_locked() * >> perform IO * >> clear_locked() * >> clear_inuse() * * Note: Usually inuse count should be dropped after the lock is released, * but there's one case where you may drop inuse count while the lock * is held. This is if you want to call bytes_chunk_cache::release() * but want the lock for setting the membuf uptodate f.e. * See read_callback() for such usage. */ std::vector<bytes_chunk> get(uint64_t offset, uint64_t length) { num_get++; num_get_g++; bytes_get += length; bytes_get_g += length; /* * Perform inline pruning if needed. * We do inline pruning when we are "extremely" high on memory usage * and hence cannot proceed w/o making space for the new request. 
*/ inline_prune(); return scan(offset, length, scan_action::SCAN_ACTION_GET, nullptr /* bytes_released */, nullptr /* extent_left */, nullptr /* extent_right */); } /** * Same as get() but to be used by writers who want to write to the given * cache range but are also interested in knowing when enough dirty data is * accumulated that they may want to flush/sync. Caller can pass two * uint64_t pointers to find out the largest contiguous dirty byte range * containing the requested byte range. Based on their wsize setting or some * other criteria, caller can then decide if they want to flush the dirty * data. * If extent_left and extent_right are non-null, on completion they will * hold the left and right edges of the extent containing the range * [offset, offset+length). Note that an extent is a collection of one or * more membufs which cache contiguous bytes. * * Note: [extent_left, extent_right) range contains one or more *full* * membufs, also those membufs are dirty and not already flushing. * If [offset, offset+length) falls on existing membuf(s) then we * include those in [extent_left, extent_right) irrespective of the * dirty/flushing flags since part of the membuf(s) is going to be * updated and membuf will become dirty and hence would need to be * flushed. BUT, as usual the caller must check the uptodate flag to * decide if it needs to do a Read-Modify-Write before flushing. * * Note: Once the getx() call returns the extent details, since it doesn't * hold membuf lock on any of the membufs in the extent range, some * other thread can potentially initiate sync/flush of the membuf(s). * Though this should not be common since the thread writing data * is the first one to know about it, but depending on whether you * have parallel writers or some periodic flusher thread, it can * happen. */ std::vector<bytes_chunk> getx(uint64_t offset, uint64_t length, uint64_t *extent_left, uint64_t *extent_right) { num_get++; num_get_g++; bytes_get += length; bytes_get_g += length; /* * Perform inline pruning if needed. * We do inline pruning when we are "extremely" high on memory usage * and hence cannot proceed w/o making space for the new request. */ inline_prune(); return scan(offset, length, scan_action::SCAN_ACTION_GET, nullptr /* bytes_released */, extent_left, extent_right); } /** * Try and release chunks in the range [offset, offset+length) from * chunkmap. Only chunks which are fully contained inside the range would * be released, while chunks which lie partially in the range are trimmed * (by updating the buffer, length and offset members). These will be * released later when a future release() call causes them to contain no * valid data. release() will skip/ignore any byte range that's not * currently cached. * * Note that a release() call is an advice and not an order. Only those * chunks will be released which are not actively being used. Following * chunks won't be released: * - Which are inuse. * These may have ongoing IOs, so not safe to release. * - Which are dirty. * These need to be flushed to the Blob, else we lose data. * * Additionally, release() will *not* trim chunks unless the release()d * range aligns with either the left or the right edge, i.e., for ranges * falling in the middle of a chunk will be skipped. * * If release() successfully releases one or more chunks, a subsequent * call to get() won't find them in the chunkmap and hence will allocate * fresh chunk (with is_new true). 
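     *
     * For example (illustrative; mirrors the UTILIZE_TAILROOM_FROM_LAST_MEMBUF
     * discussion at the top of this file, with 'cache' being some
     * bytes_chunk_cache instance):
     *
     *   auto bcs = cache.get(0, 131072);    // one new 128K chunk
     *   // Backend read hit eof after 10 bytes; caller drops its inuse
     *   // count on the chunk and then gives back the unused tail.
     *   cache.release(10, 131062);          // trims the cached chunk to [0, 10)
     *   auto bcs2 = cache.get(10, 131062);  // fresh chunk, is_new == true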
     *
     * Note that release() removes the chunks from chunkmap and drops the
     * original ref on the membufs. The membuf itself won't be freed till the
     * last ref on it is dropped, i.e., users can safely access membuf(s)
     * returned by get() even if some other thread calls release().
     *
     * It returns the number of bytes actually released. These could be full
     * chunks or partial chunks (both of which are not currently in use).
     * Caller can use this to decide if it wants to update the membuf flags,
     * f.e., if a reader gets a bc of 100 bytes but when it read the backing
     * file it got eof after 10 bytes, it should try to release the [10, 100)
     * byte range. If it's able to release successfully that would mean that
     * it is the sole owner and hence it can mark it uptodate, else it cannot
     * release and cannot mark uptodate.
     *
     * Note: For releasing all chunks and effectively nuking the cache, use
     *       clear(), but note that clear() also won't release above chunks,
     *       for which safe_to_release() returns false.
     */
    uint64_t release(uint64_t offset, uint64_t length)
    {
        uint64_t bytes_released;

        num_release++;
        num_release_g++;

        scan(offset, length, scan_action::SCAN_ACTION_RELEASE,
             &bytes_released);
        assert(bytes_released <= length);

        bytes_release += bytes_released;
        bytes_release_g += bytes_released;

        return bytes_released;
    }

    /**
     * Truncate cache to 'trunc_len' bytes.
     * Any cached byte(s) with offset >= 'trunc_len' will be removed from the
     * cache. All fully truncated bcs (having bc.offset >= 'trunc_len') will be
     * removed from chunkmap. Note that 'trunc_len' can fall inside a bc; such
     * a partially truncated bc will be trimmed from the right to remove any
     * bytes after 'trunc_len'.
     * There are two ways to call truncate(), with 'post' as false or true.
     * If 'post' is false, it'll lock all the affected bcs, so it may need to
     * wait for ongoing IOs on those membufs to complete, hence it can take a
     * long time depending on how many bytes_chunk are affected by truncate
     * and if there are IOs ongoing on those. If 'post' is true then it calls
     * membuf::try_lock() and skips truncating the membufs which it could not
     * lock.
     * Caller will typically call with post=false once at the beginning from a
     * context that can afford to wait, make sure through external means that
     * no new data is added to the cache, and then finally call once with
     * post=true to clean up anything that was added after the prev call.
     *
     * Note: Since truncate() does not return with the chunkmap lock held, a
     *       get() call done right after truncate() returns can add new data
     *       to the truncated region. Caller should make sure through some
     *       other means that new data is not added.
     *
     * Returns number of chunks that must have been deleted or trimmed, but
     * were skipped as they were inuse. Since VFS writes are blocked during
     * truncate those chunks could be in use only because of some ongoing
     * read calls. Caller will typically wait and call truncate() again, till
     * it returns 0, indicating that all chunks in the region have been
     * truncated.
     * bytes_truncated is the number of bytes dropped from the cache to serve
     * this truncate request.
     */
    int truncate(uint64_t trunc_len, bool post, uint64_t& bytes_truncated);

    /*
     * Returns all dirty chunks for a given range in chunkmap.
     * Before returning it increases the inuse count of underlying membuf(s).
     * Caller will typically sync dirty membuf to Blob and once done must call
     * clear_inuse().
     *
     * Note: The returned membuf(s) were found dirty when get_dirty_bc_range()
     *       scanned the chunkmap while holding the chunkmap lock.
     *       They may or may not be flushing.
     *       Since returned membuf(s) are not locked by get_dirty_bc_range(),
     *       some other thread may already be flushing them or may start
     *       flushing anytime after the call returns, hence the caller MUST
     *       check for that after holding the membuf lock, before it tries to
     *       flush those membuf(s).
     */
    std::vector<bytes_chunk> get_dirty_bc_range(
            uint64_t st_off = 0,
            uint64_t end_off = UINT64_MAX) const;

    /*
     * Returns dirty chunks which are not already flushing, in the given range,
     * from chunkmap. If bytes pointer is passed it's populated with the total
     * number of bytes included in the returned bytes_chunk vector.
     * Before returning it increases the inuse count of underlying membuf(s).
     * Caller will typically sync dirty membuf to Blob and once done must call
     * clear_inuse().
     *
     * Note: Caller MUST call get_dirty_nonflushing_bcs_range() with flush_lock
     *       held.
     *       This ensures that none of the returned chunks starts being flushed
     *       by any other thread (as any new flush will wait for the
     *       flush_lock).
     *       flush_lock can be released after sync_membufs() is called for the
     *       returned chunks. It'll set dirty membufs to flushing, and issue
     *       write_rpc. These chunks won't be returned by a subsequent call to
     *       get_dirty_nonflushing_bcs_range().
     */
    std::vector<bytes_chunk> get_dirty_nonflushing_bcs_range(
            uint64_t st_off = 0,
            uint64_t end_off = UINT64_MAX,
            uint64_t *bytes = nullptr) const;

    /*
     * Returns all dirty chunks which are currently flushing for a given range
     * in chunkmap. Before returning it increases the inuse count of underlying
     * membuf(s).
     * Caller will typically wait for these membuf(s) to complete flushing to
     * Blob and once done must call clear_inuse().
     *
     * Note: The returned membuf(s) were found flushing when
     *       get_flushing_bc_range() scanned the chunkmap while holding the
     *       chunkmap lock.
     *       Since returned membuf(s) are not locked by get_flushing_bc_range(),
     *       they may complete flushing and (optionally) be marked commit
     *       pending if they were flushed using UNSTABLE write, hence the
     *       caller MUST check for that after holding the membuf lock, before
     *       it tries to commit those membuf(s).
     */
    std::vector<bytes_chunk> get_flushing_bc_range(
            uint64_t st_off = 0,
            uint64_t end_off = UINT64_MAX) const;

    /**
     * Returns contiguous dirty (and not flushing) chunks from chunkmap,
     * starting with the lowest dirty offset, and returns the total number of
     * (dirty) bytes contained in the returned chunks.
     * Before returning it increases the inuse count of underlying membuf(s).
     * Caller will typically flush these to the backing Blob as UNSTABLE
     * writes.
     */
    std::vector<bytes_chunk> get_contiguous_dirty_bcs(
            uint64_t *bytes = nullptr) const;

    /*
     * Returns *all* commit pending chunks in chunkmap.
     * Before returning it increases the inuse count of underlying membuf(s)
     * and sets the membufs locked. Caller will typically commit the returned
     * membuf(s) to Blob and once done must call clear_commit_pending(),
     * clear_locked() and clear_inuse() in that order.
     *
     * Note: We don't have a use case where we need to commit for a given
     *       range, as the server, on committing, deletes all the temporary
     *       Put Blocks.
     *
     * Note: The returned membuf(s) were found pending commit when
     *       get_commit_pending_bcs() scanned the chunkmap while holding the
     *       chunkmap lock. Since returned membuf(s) are not locked by
     *       get_commit_pending_bcs(), some other thread may already start
     *       committing them anytime after the call returns, hence the caller
     *       MUST check for that after holding the membuf lock, before it
     *       tries to commit those membuf(s).
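     *
     * Illustrative caller sequence (sketch only; commit_to_blob() is a
     * hypothetical helper, not part of this header):
     *
     *   auto bcs = cache.get_commit_pending_bcs(&bytes);
     *   commit_to_blob(bcs);                 // issue the COMMIT RPC(s)
     *   for (auto& bc : bcs) {
     *       struct membuf *mb = bc.get_membuf();
     *       mb->clear_commit_pending();
     *       mb->clear_locked();
     *       mb->clear_inuse();
     *   }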
*/ std::vector<bytes_chunk> get_commit_pending_bcs( uint64_t *bytes = nullptr) const; /** * Drop cached data in the given range. * This must be called only for file-backed caches. For non file-backed * caches this is a no-op. * * Returns the number of bytes reclaimed by dropping the cache. A -ve * return indicates error in munmap(). * * Note: It might make sense to not call drop() at all and leave all chunks * mmap()ed at all times, and depend on kernel to manage the buffer * cache. If kernel drops some part of the buffer cache, subsequent * users of that byte range would cause a page fault and kernel will * silently load data from the backing file. * Another approach would be to use mlock() to lock buffer cache data * that we want and let drop() munlock() it so that kernel can choose * to free it. This needs to be tested. * The advantage of having drop support is that we can choose to * drop specific file caches, which are less/not used, and leave the * more actively used caches mapped. Kernel won't have this knowledge * and it can flush any of the file caches under memory pressure. */ int64_t drop(uint64_t offset, uint64_t length); /** * Clear the cache by releasing all chunks from the cache. * For file-backed cache, this also releases all the file blocks. * This will be called for invalidating the cache for a file, typically * when we detect that file has changed (through getattr or preop attrs * telling that mtime is different than what we have cached) or when * the file inode is forgotten by fuse and we don't want to keep the cache * anymore. For the latter case, clear_nolock() must be called with * shutdown param as true. * * When shutdown is false, following chunks won't be released. * - Which are inuse. * These may have ongoing IOs, so not safe to release. * - Which are dirty. * These need to be flushed to the Blob, else we lose data. * When shutdown is true it means the caller wants to purge the cache and * the file is no longer being used, so we release all chunks irrespective * of their current state. */ void clear_nolock(bool shutdown = false); void clear(bool shutdown = false) { const std::unique_lock<std::mutex> _lock(chunkmap_lock_43); clear_nolock(shutdown); } /** * Mark the cache as invalid. * Should be called when it's determined that the cached data is not valid. * Depending on the requirement, caller may want to sync the dirty data * with the sever before prceeding. */ void invalidate() { invalidate_pending = true; } /** * Atomically clear invalidate_pending and return the old value. */ bool test_and_clear_invalidate_pending() { return invalidate_pending.exchange(false); } /** * Drop memory cache for all chunks in this bytes_chunk_cache. * Chunks will be loaded as user calls get(). * * Returns the number of bytes reclaimed by dropping the cache. A -ve * return indicates error in munmap(). * * See discussion in drop(). */ int64_t dropall() { return drop(0, UINT64_MAX); } bool is_file_backed() const { return !backing_file_name.empty(); } /** * Maximum size a dirty extent can grow before we should flush it. * This is 60% of the allowed cache size or 1GB whichever is lower. * The reason for limiting it to 1GiB (or 10 x AZNFSC_MAX_BLOCK_SIZE) is * because there's not much value in holding more data than the Blob * NFS server's scheduler cache size. We want to send as prompt as * possible to utilize the n/w b/w but slow enough to give the write * scheduler an opportunity to merge better. * For unstable writes this allows us enough PB parallelism. 
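     *
     * Illustrative arithmetic (the numbers are an assumption, not defaults):
     * with a 4GiB per-user cache limit, 60% is ~2.4GiB, so the dirty extent
     * limit gets capped at the lower value of 1GiB; with a 1GiB cache limit,
     * 60% (~614MiB) is the lower value and wins.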
*/ static uint64_t max_dirty_extent_bytes(); /** * Get the amount of dirty data that needs to be flushed. * This excludes the data which is already flushing. * Note that once a thread starts flushing one or more membufs the dirty * counter doesn't reduce till the writes complete but another thread * looking to flush should not account for those as they are already * being flushed. * * When called with inode->flush_lock() held, bytes to flush can go up * after get_bytes_to_flush() returns but it cannot go down. * This is because with flush_lock held no new flush can start and hence * bytes_flushing cannot go up, but as ongoing flushes complete, * bytes_flushing can go down. Otoh, bytes_dirty can go up as * copy_to_cache() doesn't need the flush_lock. */ uint64_t get_bytes_to_flush() const { /* * Since we call clear_dirty() before clear_flushing(), we can have * bytes_dirty < bytes_flushing, hence we need the protection. */ return std::max((int64_t)(bytes_dirty - bytes_flushing), int64_t(0)); } /** * Get the amount of data which has been written as unstable writes, but not * yet committed. This excludes dirty data which is not flushed/written yet * or in process of flushing. It gets incremented on write completion of * dirty data flushed to Blob with unstable parameter. * * Note: The number of bytes to commit may change after the call returns. * Caller should use it as a hint, and not as a hard limit. */ uint64_t get_bytes_to_commit() const { assert(bytes_commit_pending <= AZNFSC_MAX_FILE_SIZE); return bytes_commit_pending; } /** * Returns true if one or more membufs are being currently flushed to the * backing Blob. * * Note: is_flushing_in_progress() should be called with flush_lock() held. * * Note: is_flushing_in_progress() can return false positives, i.e., it can * return true as bytes_flushing can change anytime after this call * returns, but it never returns false negatives, i.e., it never * returns false when bytes_flushing is non-zero. */ bool is_flushing_in_progress() const { return bytes_flushing > 0; } /** * Maximum size of commit_pending data that can be in cache, before we * must commit it to Blob. * It should be greater than or equal to the flush threshold (as returned * by max_dirty_extent_bytes()) and smaller than the inline write threshold * (as suggested by do_inline_write()), to minimize inline flush waits as * much as possible, in steady state. */ static uint64_t max_commit_bytes() { // Maximum cache size allowed in bytes. static const uint64_t max_total = (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL); assert(max_total != 0); /* * Minimum of 60% of max cache and 2 times the flush limit. * We want to commit as soon as possible w/o affecting performance. * If we commit too often, since commit is a serializing operation, * it'll affect the write throughput, otoh, if we commit too late * then we might hit the inline write threshold, which again would * serialize writes, bringing down throughput. */ // Capped due to global cache size. static const uint64_t max_commit_bytes_g = (max_total * 0.6); // Capped due to per-file cache discipline. static const uint64_t max_commit_bytes_l = 2 * max_dirty_extent_bytes(); static const uint64_t max_commit_bytes = std::min(max_commit_bytes_g, max_commit_bytes_l); // At least one full sized block. assert(max_commit_bytes >= AZNFSC_MAX_BLOCK_SIZE); return max_commit_bytes; } /** * Check if we must initiate a COMMIT RPC now. 
Note that the caller would * just send the COMMIT RPC and not necessarily block the user write * request till the COMMIT RPC completes, i.e., it's not an inline commit. * * We must start commit if: * 1. We have enough commit_pending data for this file/cache, or, * 2. Global memory pressure dictates that we commit now to free up * memory. In this case we might be committing more frequently which * won't necessarily be optimal, but we have no choice due to the * memory pressure. */ bool commit_required() const { const bool local_pressure = (bytes_commit_pending >= max_commit_bytes()); if (local_pressure) { INC_GBL_STATS(commit_lp, 1); return true; } /* * TODO: Take cue from global memory pressure. */ return false; } /** * Check if we must initiate flush of some cached data. Note that the caller * would just send the corresponding WRITE RPC and not necessarily block the * user write request till the WRITE RPC completes, i.e., it's not an inline * write. * * We must start flush/write if: * 1. We have enough bytes to flush so that we can write a full sized * block, or for the case of stable write, we have enough data to fill * the scheduler queue. * 2. Global memory pressure dictates that we flush now to free up memory. * In this case we might be flushing more frequently which won't * necessarily be optimal, but we have no choice due to the memory * pressure. */ bool flush_required(uint64_t extent_size = 0) const { static const uint64_t mdeb = max_dirty_extent_bytes(); if (extent_size >= mdeb) { INC_GBL_STATS(flush_seq, 1); return true; } const bool local_pressure = (get_bytes_to_flush() >= mdeb); if (local_pressure) { INC_GBL_STATS(flush_lp, 1); return true; } /* * TODO: Take cue from global memory pressure. */ return false; } /** * This should be called by writer threads to find out if they must wait * for the write to complete. This will check both the cache specific and * global memory pressure. */ bool do_inline_write() const { /* * Allow four full-sized dirty extents before we force inline write. * This 4GB dirty data could be sitting in any combination of commit * pending, flushing and dirty. Let's see how the flush, commit and * dirty bytes would play out. * * e.g., * if the max_dirty_extent_bytes() is 1GB, then we have * flush_required() @ 1GB * commit_required() @ 2GB * do_inline_write() @ 4GB. * * Assuming backend flush speed of 1GB/s and memory write speed of * 5GB/s and typical commit time of ~50ms, we have the following * derived facts: * - flush of 1GB extent will take ~1sec. * - during this time application can dirty 5GB more data, so we * are likely to hit inline write limit even before the flush * completed and commit starts. If we consider slightly lesser * write speeds of say 2GB/s, then by the time the flush completes * we have dirtied another 2GB. Now flush for the next 1GB extent * will start and it'll take another 1sec and by this time writes * can dirty another 2GB, so inline write limit will hit. * * This means, * till [0-1GB] dirty data, we don't trigger any flushing/committing and * of course we don't hold application writes for inline writes. * The first write to hit 1GB dirty data, triggers flushing of the entire * 1GB extent through multiple block sized flushes. The write itself, * and further writes are not blocked. * When the 1GB extent completes flushing, the extent is marked commit * pending and if another 1GB dirty extent is accumulated we start * flushing that extent. If the next 1GB dirty extent is not yet ready, * we do nothing. 

    /**
     * This should be called by writer threads to find out if they must wait
     * for the write to complete. This will check both the cache specific and
     * global memory pressure.
     */
    bool do_inline_write() const
    {
        /*
         * Allow four full-sized dirty extents before we force inline write.
         * This 4GB of dirty data could be sitting in any combination of
         * commit pending, flushing and dirty. Let's see how the flush,
         * commit and dirty bytes would play out.
         *
         * e.g., if max_dirty_extent_bytes() is 1GB, then we have
         *   flush_required()  @ 1GB
         *   commit_required() @ 2GB
         *   do_inline_write() @ 4GB
         *
         * Assuming backend flush speed of 1GB/s, memory write speed of
         * 5GB/s and typical commit time of ~50ms, we have the following
         * derived facts:
         * - flush of a 1GB extent will take ~1sec.
         * - during this time the application can dirty 5GB more data, so we
         *   are likely to hit the inline write limit even before the flush
         *   completes and the commit starts. If we consider a slightly lower
         *   write speed of say 2GB/s, then by the time the flush completes
         *   we have dirtied another 2GB. Now the flush for the next 1GB
         *   extent starts and takes another 1sec, and by this time writes
         *   can dirty another 2GB, so the inline write limit will be hit.
         *
         * This means:
         * till [0-1GB] dirty data, we don't trigger any flushing/committing
         * and of course we don't hold application writes for inline writes.
         * The first write to hit 1GB of dirty data triggers flushing of the
         * entire 1GB extent through multiple block sized flushes. The write
         * itself, and further writes, are not blocked.
         * When the 1GB extent completes flushing, the extent is marked
         * commit pending and if another 1GB dirty extent has accumulated we
         * start flushing that extent. If the next 1GB dirty extent is not
         * yet ready, we do nothing. When the next 1GB dirty extent is ready
         * we initiate flushing, and when it completes flushing we have 2GB
         * commit pending and hence commit is triggered. No writes are
         * blocked till now, and they can dirty 2 more 1GB extents.
         * Commit typically takes around 50ms to complete. For a RAM write
         * speed of say 5GB/s, this would mean the application can dirty
         * 250MB more till the commit completes.
         * More application writes will cause more dirty data. If the commit
         * completes, then we have more free space and we don't need writes
         * to wait inline, else we hit the inline limit and writes are
         * blocked.
         */
        static const uint64_t max_dirty_allowed_per_file =
            max_dirty_extent_bytes() * 4;
        const bool local_pressure =
            (bytes_dirty + bytes_commit_pending) > max_dirty_allowed_per_file;

        /*
         * TODO: Add counter/stats for counting how many times we forced
         *       inline write due to local and how many times due to global
         *       reasons.
         */
        if (local_pressure) {
            INC_GBL_STATS(inline_writes_lp, 1);
            return true;
        }

        /*
         * Global pressure is when get_prune_goals() returns non-zero bytes
         * to be pruned inline.
         */
        uint64_t inline_bytes;
        get_prune_goals(&inline_bytes, nullptr);

        const bool global_pressure = (inline_bytes > 0);
        if (global_pressure) {
            INC_GBL_STATS(inline_writes_gp, 1);
            return true;
        }

        return false;
    }
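
    /*
     * Worked example for the local-pressure check above (illustrative,
     * assuming max_dirty_extent_bytes() == 1GiB, so
     * max_dirty_allowed_per_file == 4GiB): with bytes_dirty == 1.5GiB and
     * bytes_commit_pending == 2.6GiB,
     *
     *   1.5GiB + 2.6GiB = 4.1GiB > 4GiB
     *
     * so do_inline_write() returns true and the writer must wait for some
     * flush/commit to complete before dirtying more data.
     */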

    /**
     * get_prune_goals() looks at the following information and returns prune
     * goals for this cache:
     * - Total memory consumed by all caches.
     * - aznfsc_cfg.cache.data.user.max_size_mb (maximum total cache size
     *   allowed).
     * - Memory consumed by this particular cache.
     *
     * It returns two types of prune goals:
     * - Inline.
     *   This tells how much memory to free inline.
     *   This will be non-zero only under extreme memory pressure where we
     *   cannot let writers continue w/o making space.
     * - Periodic.
     *   This tells how much memory to free by the periodic sync thread.
     *   In most common cases this is how memory will be reclaimed.
     */
    void get_prune_goals(uint64_t *inline_bytes,
                         uint64_t *periodic_bytes) const
    {
        // Maximum cache size allowed in bytes.
        static const uint64_t max_total =
            (aznfsc_cfg.cache.data.user.max_size_mb * 1024 * 1024ULL);
        assert(max_total != 0);

        /*
         * If cache usage grows to 90% of max, we enforce inline pruning for
         * writers. When cache usage grows more than 70% we recommend
         * periodic pruning. If the cache size is sufficient, hopefully we
         * will not need inline pruning too often, as it hurts application
         * write performance.
         * Once curr_bytes_total exceeds inline_threshold we need to perform
         * inline pruning. We prune all the way upto inline_target to avoid
         * hysteresis. Similarly for periodic pruning we prune all the way
         * upto periodic_target.
         *
         * The following also means that at any time, half of the max cache
         * size can be safely present in the cache.
         */
        static const uint64_t inline_threshold = (max_total * 0.9);
        static const uint64_t inline_target = (max_total * 0.8);
        static const uint64_t periodic_threshold = (max_total * 0.7);
        static const uint64_t periodic_target = (max_total * 0.6);

        /*
         * Current total cache size in bytes. Save it once to avoid issues
         * with bytes_allocated* changing midway in these calculations.
         */
        const uint64_t curr_bytes_total = bytes_allocated_g;
        const uint64_t curr_bytes = bytes_allocated;

        if (inline_bytes) {
            *inline_bytes = 0;
        }

        if (periodic_bytes) {
            *periodic_bytes = 0;
        }

        /*
         * If current cache usage is more than the inline_threshold limit, we
         * need to recommend inline pruning. We calculate what percentage of
         * the total cache memory we need to prune and then divide that
         * proportionately among the various caches (bigger caches need to
         * prune more). We prune upto inline_target.
         */
        if (inline_bytes && (curr_bytes_total > inline_threshold)) {
            assert(inline_threshold > inline_target);

            // How much to prune?
            const uint64_t total_inline_goal =
                (curr_bytes_total - inline_target);
            const double percent_inline_goal =
                (total_inline_goal * 100.0) / curr_bytes_total;

            *inline_bytes = (curr_bytes * percent_inline_goal) / 100;

            // Prune at least 1MB.
            if (*inline_bytes < 1048576) {
                *inline_bytes = 1048576;
            }
        }

        if (periodic_bytes && (curr_bytes_total > periodic_threshold)) {
            assert(periodic_threshold > periodic_target);

            const uint64_t total_periodic_goal =
                (curr_bytes_total - periodic_target);
            const double percent_periodic_goal =
                (total_periodic_goal * 100.0) / curr_bytes_total;

            *periodic_bytes = (curr_bytes * percent_periodic_goal) / 100;

            if (*periodic_bytes < 1048576) {
                *periodic_bytes = 1048576;
            }
        }
    }
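
    /*
     * Worked example for the inline prune goal above (illustrative, all
     * values assumed): with max_total = 4GiB, inline_threshold = 3.6GiB and
     * inline_target = 3.2GiB, say bytes_allocated_g (curr_bytes_total) is
     * 4GiB and this cache's bytes_allocated (curr_bytes) is 1GiB:
     *
     *   total_inline_goal   = 4GiB - 3.2GiB       = 0.8GiB
     *   percent_inline_goal = 0.8GiB * 100 / 4GiB = 20%
     *   *inline_bytes       = 20% of 1GiB         = ~205MiB
     *
     * i.e., each cache is asked to prune ~20% of its own usage, so bigger
     * caches contribute proportionately more.
     */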

    /**
     * Check and perform inline pruning if needed.
     * We do inline pruning when we are "extremely" high on memory usage and
     * hence cannot proceed w/o making space for this new request. This must
     * be called from get() which may need more memory.
     *
     * TODO: Also add periodic pruning support.
     */
    void inline_prune();

    /**
     * This will run self tests to test the correctness of this class.
     */
    static int unit_test();

    /*
     * Stats for this cache.
     * These have been made public for easy access, w/o needing a whole bunch
     * of accessor methods. Don't update them from outside!
     *
     * bytes_allocated is the total number of memory bytes allocated for all
     * the bytes_chunk in this cache. Note that all of that memory may not be
     * used for caching and bytes_cached is the total bytes actually used for
     * caching. Following are the cases where allocated would be larger than
     * used:
     * - release() may release parts of the cache, though membuf cannot be
     *   freed till the entire membuf is unused.
     * - For file-backed cache we have to mmap() on a 4k granularity but the
     *   actual bytes_chunk may not be 4k granular.
     *
     * bytes_cached tracks the total number of bytes cached, not necessarily
     * in memory. For file-backed cache, bytes_cached may refer to memory
     * bytes or file bytes. Note that bytes_cached is not reduced when membuf
     * is drop()ped. This is because the data is still cached, albeit in the
     * backing file.
     */
    std::atomic<uint64_t> num_chunks = 0;
    std::atomic<uint64_t> num_get = 0;
    std::atomic<uint64_t> bytes_get = 0;
    std::atomic<uint64_t> num_release = 0;
    std::atomic<uint64_t> bytes_release = 0;
    std::atomic<uint64_t> num_truncate = 0;
    std::atomic<uint64_t> bytes_truncate = 0;
    std::atomic<uint64_t> bytes_allocated = 0;
    std::atomic<uint64_t> bytes_cached = 0;
    std::atomic<uint64_t> bytes_dirty = 0;
    std::atomic<uint64_t> bytes_flushing = 0;
    std::atomic<uint64_t> bytes_commit_pending = 0;
    std::atomic<uint64_t> bytes_uptodate = 0;
    std::atomic<uint64_t> bytes_inuse = 0;
    std::atomic<uint64_t> bytes_locked = 0;

    /*
     * Global stats for all caches.
     */
    static std::atomic<uint64_t> num_chunks_g;
    static std::atomic<uint64_t> num_get_g;
    static std::atomic<uint64_t> bytes_get_g;
    static std::atomic<uint64_t> num_release_g;
    static std::atomic<uint64_t> bytes_release_g;
    static std::atomic<uint64_t> num_truncate_g;
    static std::atomic<uint64_t> bytes_truncate_g;
    static std::atomic<uint64_t> bytes_allocated_g;
    static std::atomic<uint64_t> bytes_cached_g;
    static std::atomic<uint64_t> bytes_dirty_g;
    static std::atomic<uint64_t> bytes_flushing_g;
    static std::atomic<uint64_t> bytes_commit_pending_g;
    static std::atomic<uint64_t> bytes_uptodate_g;
    static std::atomic<uint64_t> bytes_inuse_g;
    static std::atomic<uint64_t> bytes_locked_g;

    // How many times set_locked() was called.
    static std::atomic<uint64_t> num_locked_g;
    // How many times set_locked() had to wait.
    static std::atomic<uint64_t> num_lockwait_g;
    // How many cumulative usecs those calls had to wait.
    static std::atomic<uint64_t> lock_wait_usecs_g;

    static uint64_t get_num_caches()
    {
        return num_caches;
    }

    struct nfs_inode *get_inode() const
    {
        return inode;
    }

    /**
     * Cache size is defined as 1 + offset of the last uptodate byte.
     *
     * Note: It can change anytime after this, so use it only as an estimate
     *       which can be stale.
     */
    uint64_t get_cache_size() const
    {
        assert(cache_size <= AZNFSC_MAX_FILE_SIZE);

        /*
         * bytes_uptodate is a count while cache_size is an offset.
         * set_uptodate() updates cache_size before bytes_uptodate so we can
         * safely assert for this. Also truncate() reduces cache_size after
         * reducing bytes_uptodate.
         *
         * Note: If a read races with truncate we can have truncate() release
         *       the membuf(s) but its destructor may not be called, hence
         *       bytes_uptodate may be reduced after cache_size, causing this
         *       assert to fail, but that should be rare so leave the assert.
         *       The 2nd part of the assert is to protect against the above.
         *       bytes_cached is decremented in truncate() when we remove a
         *       chunk from the chunkmap, but bytes_uptodate is reduced only
         *       when the membuf destructor is called, which happens only
         *       later in this case. bytes_uptodate can usually not be
         *       greater than bytes_cached but in this case it can be, and
         *       hence that covers the exception case.
         *
         * XXX If ~membuf() is called between the two checks, then the
         *     assert will fail, but that's very unlikely and the assert is
         *     o/w valuable.
         */
        assert((cache_size >= bytes_uptodate) ||
               (bytes_uptodate > bytes_cached));

        return cache_size;
    }

private:
    /**
     * Scan all chunks lying in the range [offset, offset+length) and perform
     * the requested action, as described below:
     *
     * SCAN_ACTION_GET     -> Return the list of chunks covering the
     *                        requested range, allocating non-existent chunks
     *                        and adding them to the chunkmap. If
     *                        extent_left/extent_right are non-null, they
     *                        contain the left and right edge of the
     *                        contiguous extent that contains
     *                        [offset, offset+length).
     * SCAN_ACTION_RELEASE -> Free chunks contained in the requested range.
     *
     * bytes_released will be set to the number of bytes actually released,
     * i.e., either the entire chunk was released (and membuf freed) or the
     * chunk was trimmed.
     */
    std::vector<bytes_chunk> scan(uint64_t offset,
                                  uint64_t length,
                                  scan_action action,
                                  uint64_t *bytes_released = nullptr,
                                  uint64_t *extent_left = nullptr,
                                  uint64_t *extent_right = nullptr);
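
    /*
     * Illustrative example of the extent hints above (hypothetical cache
     * state): if the chunkmap holds contiguous chunks covering [0, 3MB) and
     * a caller issues
     *
     *   scan(1MB, 1MB, SCAN_ACTION_GET, nullptr, &left, &right);
     *
     * then the returned bytes_chunk vector covers [1MB, 2MB) while
     * left/right describe the containing contiguous extent, i.e., left = 0
     * and right = 3MB, presumably so that callers can derive an extent size
     * for checks like flush_required(extent_size).
     */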

    /**
     * This must be called with the bytes_chunk_cache lock held.
     */
    bool extend_backing_file(uint64_t newlen)
    {
        // No-op for non file-backed caches.
        if (backing_file_fd == -1) {
            assert(backing_file_len == 0);
            return true;
        }

        assert(newlen > 0);
        assert(newlen <= AZNFSC_MAX_FILE_SIZE);
        assert(backing_file_fd > 0);

        if (backing_file_len < newlen) {
            const int ret = ::ftruncate(backing_file_fd, newlen);
            if (ret != 0) {
                AZLogError("ftruncate(fd={}, length={}) failed: {}",
                           backing_file_fd, newlen, strerror(errno));
                assert(0);
                return false;
            }
            backing_file_len = newlen;
        }

        return true;
    }

    /*
     * std::map of bytes_chunk, indexed by the starting offset of the chunk.
     */
    std::map<uint64_t, struct bytes_chunk> chunkmap;

    // Lock to protect chunkmap.
    mutable std::mutex chunkmap_lock_43;

    /*
     * Size of the cache.
     * This is 1 + offset of the last uptodate byte seen by this cache.
     * Increased in set_uptodate() and decreased *only* in truncate(), iff
     * truncate shrinks the file size. Note that this reflects the maximum
     * cache size that we had till now and not necessarily what the current
     * cache holds. IOW, release()/clear()/inline_prune() may release one or
     * more chunks, thus removing the actual cache contents, but that doesn't
     * reduce the cache_size.
     *
     * Note: It should actually be called cached_filesize.
     */
    std::atomic<uint64_t> cache_size = 0;

    /*
     * File whose data we are caching.
     * Note that we don't hold a ref on this inode so it's only safe to use
     * from inline_prune() where we know the inode is active.
     *
     * XXX If you use it from some other place, either make sure the inode is
     *     safe to use from there or hold a ref on the inode.
     */
    struct nfs_inode *const inode;

    std::string backing_file_name;
    int backing_file_fd = -1;
    std::atomic<uint64_t> backing_file_len = 0;

    /*
     * Flag to quickly mark the cache as invalid w/o purging the entire
     * cache. Once invalidate_pending is set, the next cache lookup will
     * first purge the cache before proceeding.
     */
    std::atomic<bool> invalidate_pending = false;

    // Count of total active caches.
    static std::atomic<uint64_t> num_caches;
};

}

#endif /* __AZNFSC_FILE_CACHE_H__ */