// turbonfs/inc/nfs_inode.h
#ifndef __NFS_INODE_H__
#define __NFS_INODE_H__
#include <atomic>
#include <chrono>
#include "aznfsc.h"
#include "rpc_readdir.h"
#include "file_cache.h"
#include "readahead.h"
#include "fcsm.h"
#define NFS_INODE_MAGIC *((const uint32_t *)"NFSI")
// Compare two nfs_fh3 filehandles.
#define FH_EQUAL(fh1, fh2) \
(((fh1)->data.data_len == (fh2)->data.data_len) && \
(!::memcmp((fh1)->data.data_val, \
(fh2)->data.data_val, \
(fh1)->data.data_len)))
#define FH_VALID(fh) \
(((fh)->data.data_len > 0) && ((fh)->data.data_val != nullptr))
/**
* C++ object holding a deep copy of struct nfs_fh3 from libnfs.
*/
struct nfs_fh3_deep
{
nfs_fh3_deep(const struct nfs_fh3& _fh)
{
#ifndef ENABLE_NON_AZURE_NFS
// Blob NFS FH is more than 50 bytes and at most 64 bytes.
assert(_fh.data.data_len > 50 && _fh.data.data_len <= 64);
#else
assert(_fh.data.data_len <= 64);
#endif
fh.data.data_len = _fh.data.data_len;
fh.data.data_val = &fh_data[0];
::memcpy(fh.data.data_val, _fh.data.data_val, fh.data.data_len);
}
/**
* Return the libnfs nfs_fh3 object ref.
*/
const struct nfs_fh3& get_fh() const
{
assert(FH_VALID(&fh));
return fh;
}
private:
struct nfs_fh3 fh;
char fh_data[64];
};
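/*
* Example (illustrative sketch, not part of the API): deep copy a libnfs
* filehandle so it outlives the RPC response that carried it. 'server_fh'
* below is a hypothetical struct nfs_fh3 received in some RPC response.
*
*   const nfs_fh3_deep my_fh(server_fh);
*   // Safe to use even after the RPC response memory is freed.
*   const struct nfs_fh3& fh_ref = my_fh.get_fh();
*/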
/**
* Properties common to the entire filesystem.
*/
struct nfs_superblock
{
mutable std::shared_mutex sb_lock;
/*
* Blocksize and other filesystem properties.
*/
struct statvfs st;
/*
* Preferred readdir size (for directory enumeration).
*/
uint64_t dtpref;
uint64_t get_blocksize() const
{
assert(st.f_bsize >= 4096);
return st.f_bsize;
}
uint64_t get_dtpref() const
{
assert(dtpref >= 4096);
return dtpref;
}
};
/**
* This is the NFS inode structure. There is one of these per file/directory
* and contains any global information about the file/directory, f.e.,
* - NFS filehandle for accessing the file/directory.
* - FUSE inode number of the file/directory.
* - File/Readahead cache (if any).
* - Anything else that we want to maintain per file.
*/
struct nfs_inode
{
/*
* As we typecast back-n-forth between the fuse inode number and our
* nfs_inode structure, we use the magic number to confirm that we
* have the correct pointer.
*/
const uint32_t magic = NFS_INODE_MAGIC;
/*
* Filesystem properties, common to all inodes.
*/
static struct nfs_superblock sb;
/*
* Inode lock.
* Inode must be updated only with this lock held.
* VFS can make multiple calls (not writes) to the same file in parallel.
*/
mutable std::shared_mutex ilock_1;
/*
* Note on inode flush locking.
* is_flushing atomic boolean (aided by iflush_lock_3 and flush_cv) is used
* for synchronizing changes to backend file size as a result of
* flush/commit along with application initiated truncate calls forcing a
* specific file size. Note that the inode lock ilock_1 is for synchronizing
* the application visible state of the inode (attr cache, etc), while inode
* flush locking is for synchronizing changes to the on-disk file (through
* flush, commit and truncate).
* Any flush done to an inode will mark all the to-be-flushed membufs as
* flushing while holding this lock and any truncate call will hold this
* lock to ensure no new flush/commit operations are started while it
* updates the file size using SETATTR RPC.
*
* See flush_lock()/flush_unlock() for the actual locking.
*
* Note: Though it's called the flush lock, it protects backend file size
* changes done through both flush and commit.
*/
mutable std::atomic<bool> is_flushing = false;
mutable std::condition_variable_any flush_cv;
mutable std::mutex iflush_lock_3;
/*
* S_IFREG, S_IFDIR, etc.
* 0 is not a valid file type.
*/
const uint32_t file_type = 0;
/*
* Ref count of this inode.
* Fuse expects that whenever we make one of the following calls, we
* must increment the lookupcnt of the inode:
* - fuse_reply_entry()
* - fuse_reply_create()
* - Lookup count of every entry returned by readdirplus(), except "."
* and "..", is incremented by one. Note that readdir() does not
* affect the lookup count of any of the entries returned.
*
* Since an nfs_inode is created only in response to one of the above,
* we set the lookupcnt to 1 when the nfs_inode is created. Later if
* we are not able to successfully convey creation of the inode to fuse
* we drop the ref. This is important as unless fuse knows about an
* inode it'll never call forget() for it and we will leak the inode.
* forget() causes lookupcnt for an inode to be reduced by the "nlookup"
* parameter count. forget_multi() does the same for multiple inodes in
* a single call.
* On umount the lookupcnt for all inodes implicitly drops to zero, and
* fuse may not call forget() for the affected inodes.
*
* Till the lookupcnt of an inode drops to zero, we MUST not free the
* nfs_inode structure, as kernel may send requests for files with
* non-zero lookupcnt, even after calls to unlink(), rmdir() or rename().
*
* dircachecnt is another refcnt which is the number of readdirplus
* directory_entry,s that refer to this nfs_inode. An inode can only be
* deleted when both lookupcnt and dircachecnt become 0, i.e., fuse
* vfs does not have a reference to the inode and it's not cached in
* any of our readdirectory_cache,s.
*
* See comment above inode_map.
*
* See comment above forget_expected.
*/
mutable std::atomic<uint64_t> lookupcnt = 0;
mutable std::atomic<uint64_t> dircachecnt = 0;
/*
* How many open fds for this file are currently present in fuse.
* Incremented when fuse calls open()/creat().
*/
std::atomic<uint64_t> opencnt = 0;
/*
* Silly rename related info.
* If this inode has been successfully silly renamed, is_silly_renamed will
* be set and silly_renamed_name will contain the silly renamed name and
* parent_ino is the parent directory ino. These will be needed for
* deleting the silly renamed file once the last handle on the file is
* closed by user.
* silly_rename_level helps to get unique names in case the silly renamed
* file itself is deleted.
*/
bool is_silly_renamed = false;
std::string silly_renamed_name;
fuse_ino_t parent_ino = 0;
int silly_rename_level = 0;
private:
/*
* NFSv3 filehandle returned by the server.
* We use this to identify this file/directory to the server.
*/
const nfs_fh3_deep fh;
/*
* CRC32 hash of fh.
* This serves multiple purposes, most importantly it can be used to print
* filehandle hashes in a way that can be used to match with wireshark.
* Also used for affining writes to a file to one RPC transport.
*/
const uint32_t crc = 0;
/*
* This is a handle to the chunk cache which caches data for this file.
* Valid only for regular files.
* filecache_handle starts null in the nfs_inode constructor and is later
* initialized only in on_fuse_open() (when we return the inode to fuse in
* a lookup response or the application calls open()/creat()). The idea is
* to allocate the cache only when really needed. For inodes returned to
* fuse in a readdirplus response we don't initialize the filecache_handle.
* Once initialized we never make it null again, though we can make the
* cache itself empty by invalidate_cache(). So if has_filecache() returns
* true we can safely access the filecache_handle shared_ptr returned by
* get_filecache().
* alloc_filecache() initializes filecache_handle and sets filecache_alloced
* to true.
* Access to this shared_ptr must be protected by ilock_1, whereas access to
* the bytes_chunk_cache itself must be protected by chunkmap_lock_43.
*/
std::shared_ptr<bytes_chunk_cache> filecache_handle;
std::atomic<bool> filecache_alloced = false;
/*
* Pointer to the readdirectory cache.
* Only valid for a directory, this will be nullptr for a non-directory.
* Access to this shared_ptr must be protected by ilock_1, whereas access to
* the readdirectory_cache itself must be protected by readdircache_lock_2.
* Also see comments above filecache_handle.
*/
std::shared_ptr<readdirectory_cache> dircache_handle;
std::atomic<bool> dircache_alloced = false;
/*
* For maintaining readahead state.
* Valid only for regular files.
* Access to this shared_ptr must be protected by ilock_1, whereas access to
* the ra_state itself must be protected by ra_lock_40.
* Also see comments above filecache_handle.
*/
std::shared_ptr<ra_state> readahead_state;
std::atomic<bool> rastate_alloced = false;
/*
* Flush-commit state machine, used for performing flush/commit to the
* backend file.
* Valid only for regular files.
* Also see comments above filecache_handle.
*/
std::shared_ptr<struct fcsm> fcsm;
std::atomic<bool> fcsm_alloced = false;
/*
* Cached attributes for this inode.
* These cached attributes are valid till the absolute milliseconds value
* attr_timeout_timestamp. On expiry of this we will revalidate the inode
* by querying the attributes from the server. If the revalidation is
* successful (i.e., inode has not changed since we cached), then we
* increase attr_timeout_secs in an exponential fashion (upto the max
* actimeout value) and set attr_timeout_timestamp accordingly.
*
* If attr_timeout_secs is -1 that implies that cached attributes are
* not valid and we need to fetch the attributes from the server. This
* should never happen as we set attr in the nfs_inode constructor and
* from then on it's always set.
*
* See update_nolock() how these attributes are compared with freshly
* fetched preop or postop attributes to see if file/dir has changed
* (and thus the cache must be invalidated).
*
* Note: This MUST be accessed under ilock_1.
*
* Note: External users can access it using the get_attr() method which
* correctly accesses it under ilock_1.
* Callers already holding ilock_1 must use the nolock version
* get_attr_nolock().
*/
struct stat attr;
/**
* We maintain the following multiple views of the file and thus multiple file
* sizes for those views.
* - Cached.
* This is the view of the file that comprises of data that has been
* written by the application and saved in file cache. It may or may not
* have been flushed and/or committed. This is the most uptodate view of
* the file and applications must use this view.
* get_cached_filesize() returns the cached file size.
* - Uncommitted.
* This is the view of the file that tracks data that has been flushed
* using UNSTABLE writes but not yet COMMITted to the Blob. This view of
* the file is only used to see if the next PB call will write after the
* last PB'ed byte and thus can be appended.
* putblock_filesize tracks the file size for this view.
* - Committed.
* This is the view of the file that tracks data committed to the Blob.
* Other clients will see this view.
* attr.st_size tracks the file size for this view.
*/
off_t putblock_filesize = 0;
/*
* For any file, stable_write starts as false as the write pattern is unknown.
* At the time of flushing cached writes to Blob we check if the given
* write causes an append write on the Blob, or an overwrite or sparse
* write. Append writes can be sent as unstable write, while non-append
* writes (either overwrite or sparse write) must go as a stable write
* (since server knows best how to allocate blocks for them).
* Once set to true, it remains true for the life of the inode.
*
* TODO: Set this to false once we have servers with unstable write
* support. Also uncomment the assert in nfs_inode constructor.
*/
bool stable_write = true;
/*
* XXX This is for debugging.
* It's set in truncate_start() and cleared in truncate_end().
*/
std::atomic<bool> truncate_in_progress = false;
public:
/*
* Fuse inode number.
* This is how fuse identifies this file/directory to us.
* Fuse expects that if we reuse an ino, we must ensure that the
* ino/generation pair is unique for the life of the fuse filesystem (and
* not just unique for one mount). This is especially useful if this fuse
* filesystem is exported over NFS. Since NFS would issue filehandles
* based on the ino number and generation pair, if ino number and generation
* pair is not unique NFS server might issue the same FH to two different
* files if "fuse driver + NFS server" is restarted. To avoid that make
* sure generation id is unique. We use the current epoch in usecs to
* ensure uniqueness. Note that even if the time goes back, it's highly
* unlikely that we use the same ino number and usec combination, but
* it's technically possible.
*
* IMPORTANT: Need to ensure time is sync'ed and it doesn't go back.
*/
const fuse_ino_t ino;
const uint64_t generation;
/*
* attr_timeout_secs will have a value between [acregmin, acregmax] or
* [acdirmin, acdirmax], depending on the filetype, and holds the current
* attribute cache timeout value for this inode, adjusted by exponential
* backoff and capped by the max limit.
* attr_timeout_timestamp is the absolute time in msecs when the attribute
* cache is going to expire.
*
* attr_timeout_secs is protected by ilock_1.
* attr_timeout_timestamp is updated under ilock_1, but can be accessed
* w/o ilock_1, f.e., run_getattr()->attr_cache_expired().
*/
std::atomic<int64_t> attr_timeout_secs = -1;
std::atomic<int64_t> attr_timeout_timestamp = -1;
/*
* Time in usecs we received the last cached write for this inode.
* See discussion in stamp_cached_write() for details.
*/
std::atomic<int64_t> last_cached_write = 0;
// nfs_client owning this inode.
struct nfs_client *const client;
/*
* How many forgets we expect from fuse.
* It'll be incremented whenever we are able to successfully call one of
* the following:
* - fuse_reply_create()
* - fuse_reply_entry()
* - fuse_reply_buf() (for readdirplus and not for readdir)
*
* Fuse must call exactly these many forgets on this inode and the inode
* can only be freed when forget_expected becomes 0. Fuse must not call
* more forgets than forget_expected.
*
* Note: forget_expected may become 0 indicating that fuse doesn't know
* about this inode but inode may still be in use (lookupcnt or
* dircachecnt can be non-zero), then we don't free the inode.
*
* We use this for forgetting all inodes on unmount, and also for
* debugging to see if fuse forgets to call forget :-)
*
* Note: In nfs_inode::decref() we assert that lookupcnt is always
* greater than or equal to forget_expected, hence wherever we
* increment both we must increment forget_expected after lookupcnt
* and v.v. we must decrement forget_expected before lookupcnt.
*/
std::atomic<int64_t> forget_expected = 0;
#ifdef ENABLE_PARANOID
uint64_t last_forget_seen_usecs = 0;
#endif
/*
* Stores the write error observed when performing backend writes to this
* Blob. This helps us duly fail close(), if one or more IOs have failed
* for the Blob. Note that the application write may complete immediately
* after copying the data to the cache but later when sync'ing dirty
* membufs with the Blob we might encounter write failures. These failures
* MUST be conveyed to the application via close(), else it'll never know.
*
* This is either 0 (no error) or a +ve errno value.
*/
int write_error = 0;
/*
* Commit state for this inode.
* This is used to track the state of commit operation for this inode, which
* can be one of:
* COMMIT_NOT_NEEDED: No or not enough uncommitted (written using unstable
* writes) data.
* Note that we want to commit multiple blocks at a time
* to amortize the latency introduced by commit, given the
* fact that all writes have to stop till the commit
* completes.
* NEEDS_COMMIT: There's enough uncommitted data that needs to be
* committed.
* This indicates to the running write(flush) task that
* it must start the commit task when ongoing flushing
* completes (bytes_flushing == 0).
* COMMIT_IN_PROGRESS: There's an outstanding commit operation.
* Till it completes no write or commit for this inode
* can be sent to the server.
*
* Valid state transitions:
* COMMIT_NOT_NEEDED -> NEEDS_COMMIT -> COMMIT_IN_PROGRESS
* COMMIT_NOT_NEEDED -> COMMIT_IN_PROGRESS
* COMMIT_IN_PROGRESS -> COMMIT_NOT_NEEDED
*/
enum class commit_state_t
{
INVALID = 0,
COMMIT_NOT_NEEDED,
NEEDS_COMMIT,
COMMIT_IN_PROGRESS,
};
std::atomic<commit_state_t> commit_state = commit_state_t::COMMIT_NOT_NEEDED;
/**
* TODO: Initialize attr with postop attributes received in the RPC
* response.
*/
nfs_inode(const struct nfs_fh3 *filehandle,
const struct fattr3 *fattr,
struct nfs_client *_client,
uint32_t _file_type,
fuse_ino_t _ino = 0);
~nfs_inode();
/**
* Does this nfs_inode have cache allocated?
* It correctly checks cache for both directory and file inodes and it
* only checks if the cache is allocated and not whether cache has some
* data.
*
* Note: Only files/directories which are open()ed will have cache
* allocated, also since directory cache doubles as DNLC, for
* directories if at least one file/subdir inside this directory is
* looked up by fuse, the cache will be allocated.
*
* LOCKS: None.
*/
bool has_cache() const
{
if (is_dir()) {
return has_dircache();
} else if (is_regfile()) {
return has_filecache();
}
return false;
}
/**
* Is the inode cache (filecache_handle or dircache_handle) empty?
*
* Note: This returns the current inode cache status at the time of this
* call, it may change right after this function returns. Keep this
* in mind when using the result.
*
* LOCKS: None.
*/
bool is_cache_empty() const
{
if (is_regfile()) {
return !has_filecache() || filecache_handle->is_empty();
} else if (is_dir()) {
return !has_dircache() || dircache_handle->is_empty();
} else {
return true;
}
}
/**
* Allocate file cache if not already allocated.
* This must be called from code that returns an inode after a regular
* file is opened or created.
* It's a no-op if the filecache is already allocated.
*
* LOCKS: If not already allocated it'll take exclusive ilock_1.
*/
void alloc_filecache()
{
assert(is_regfile());
if (filecache_alloced) {
// Once allocated it cannot become null again.
assert(filecache_handle);
return;
}
std::unique_lock<std::shared_mutex> lock(ilock_1);
if (!filecache_handle) {
assert(!filecache_alloced);
if (aznfsc_cfg.filecache.enable && aznfsc_cfg.filecache.cachedir) {
const std::string backing_file_name =
std::string(aznfsc_cfg.filecache.cachedir) + "/" + std::to_string(get_fuse_ino());
filecache_handle =
std::make_shared<bytes_chunk_cache>(this, backing_file_name.c_str());
} else {
filecache_handle = std::make_shared<bytes_chunk_cache>(this);
}
filecache_alloced = true;
}
}
/**
* We split the truncate operation into two separate APIs, truncate_start()
* and truncate_end(). truncate_start() must be called before issuing the
* SETATTR RPC and truncate_end() must be called from SETATTR callback.
* truncate_start() grabs the flush_lock to ensure no new flush/commit
* operations can be issued for this inode, and waits for any ongoing
* flush/commit operations to complete, before truncating the filecache to
* the new size.
* truncate_end() calls bytes_chunk_cache::truncate(post=true) to finish
* cache truncate and calls flush_unlock() to release the flush_lock
* held by truncate_start().
* These two APIs together ensure that no flush/commit operation can
* change the file size after truncate sets it.
*/
bool truncate_start(size_t size);
void truncate_end(size_t size);
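/*
* Illustrative sketch of the intended two-phase truncate flow. Only
* truncate_start()/truncate_end() below are real, the task and callback
* names are hypothetical.
*
*   // Before issuing the SETATTR RPC that sets the new size.
*   if (inode->truncate_start(new_size)) {
*       issue_setattr_size_rpc(inode, new_size, setattr_done_cb);
*   }
*
*   // From the SETATTR callback.
*   inode->truncate_end(new_size);
*/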
/**
* This MUST be called only after has_filecache() returns true, else
* there's a possibility of data race, as the returned filecache_handle
* ref may be updated by alloc_filecache() right after get_filecache()
* returns and while the caller is accessing the shared_ptr.
* So f.e., calling "if (get_filecache())" to check presence of cache is
* not safe as get_filecache() is being used as a boolean here so it calls
* "shared_ptr::operator bool()" which returns true even while the
* shared_ptr is being initialized by alloc_filecache(), thus it causes
* a data race.
* Once filecache_handle is allocated by alloc_filecache() it remains set
* for the life of the inode, so we can safely use the shared_ptr w/o the
* inode lock.
*
* Note: This MUST be called only when has_filecache() returns true.
*
* LOCKS: None.
*/
std::shared_ptr<bytes_chunk_cache>& get_filecache()
{
assert(is_regfile());
assert(filecache_alloced);
assert(filecache_handle);
return filecache_handle;
}
const std::shared_ptr<bytes_chunk_cache>& get_filecache() const
{
assert(is_regfile());
assert(filecache_alloced);
assert(filecache_handle);
return filecache_handle;
}
/**
* External users of this nfs_inode can check for presence of filecache by
* calling has_filecache().
*
* LOCKS: None.
*/
bool has_filecache() const
{
assert(is_regfile());
assert(!filecache_alloced || filecache_handle);
return filecache_alloced;
}
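/*
* Example (illustrative sketch): the safe access pattern described above,
* i.e., check has_filecache() before calling get_filecache().
*
*   if (inode->has_filecache()) {
*       const std::shared_ptr<bytes_chunk_cache>& cache =
*           inode->get_filecache();
*       // cache can be used w/o ilock_1, it'll never become null again.
*   }
*
* Calling "if (inode->get_filecache())" instead is NOT safe, see above.
*/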
/**
* Allocate directory cache if not already allocated.
* This must be called from code that returns an inode after a directory
* is opened or created.
* It's a no-op if the dircache is already allocated.
*
* LOCKS: If not already allocated it'll take exclusive ilock_1.
*/
void alloc_dircache(bool newly_created_directory = false)
{
assert(is_dir());
if (dircache_alloced) {
// Once allocated it cannot become null again.
assert(dircache_handle);
return;
}
std::unique_lock<std::shared_mutex> lock(ilock_1);
if (!dircache_handle) {
assert(!dircache_alloced);
dircache_handle = std::make_shared<readdirectory_cache>(client, this);
/*
* If this directory is just created, mark it as "confirmed".
*/
if (newly_created_directory) {
dircache_handle->set_confirmed();
}
dircache_alloced = true;
}
}
/**
* This MUST be called only after has_dircache() returns true.
* See comment above get_filecache().
*
* Note: This MUST be called only when has_dircache() returns true.
*
* LOCKS: None.
*/
std::shared_ptr<readdirectory_cache>& get_dircache()
{
assert(is_dir());
assert(dircache_alloced);
assert(dircache_handle);
return dircache_handle;
}
const std::shared_ptr<readdirectory_cache>& get_dircache() const
{
assert(is_dir());
assert(dircache_alloced);
assert(dircache_handle);
return dircache_handle;
}
/**
* External users of this nfs_inode can check for presence of dircache by
* calling has_dircache().
*
* LOCKS: None.
*/
bool has_dircache() const
{
assert(is_dir());
assert(!dircache_alloced || dircache_handle);
return dircache_alloced;
}
/**
* Allocate readahead_state if not already allocated.
* This must be called from code that returns an inode after a file
* is opened or created.
* It's a no-op if the rastate is already allocated.
*
* LOCKS: If not already allocated it'll take exclusive ilock_1.
*/
void alloc_rastate()
{
assert(is_regfile());
if (rastate_alloced) {
// Once allocated it cannot become null again.
assert(readahead_state);
return;
}
std::unique_lock<std::shared_mutex> lock(ilock_1);
/*
* readahead_state MUST only be created if filecache_handle is set.
*/
assert(filecache_handle);
if (!readahead_state) {
assert(!rastate_alloced);
readahead_state = std::make_shared<ra_state>(client, this);
rastate_alloced = true;
}
}
/**
* This MUST be called only after has_rastate() returns true.
* See comment above get_filecache().
*
* Note: This MUST be called only when has_rastate() returns true.
*
* LOCKS: None.
*/
const std::shared_ptr<ra_state>& get_rastate() const
{
assert(is_regfile());
assert(rastate_alloced);
assert(readahead_state);
return readahead_state;
}
std::shared_ptr<ra_state>& get_rastate()
{
assert(is_regfile());
assert(rastate_alloced);
assert(readahead_state);
return readahead_state;
}
/**
* External users of this nfs_inode can check for presence of readahead
* state by calling has_rastate().
*
* LOCKS: None.
*/
bool has_rastate() const
{
assert(is_regfile());
assert(!rastate_alloced || readahead_state);
return rastate_alloced;
}
/**
* Allocate fcsm (flush-commit state machine) if not already allocated.
* This must be called from code that returns an inode after a file
* is opened or created.
* It's a no-op if the fcsm is already allocated.
*
* LOCKS: If not already allocated it'll take exclusive ilock_1.
*/
void alloc_fcsm()
{
assert(is_regfile());
if (fcsm_alloced) {
// Once allocated it cannot become null again.
assert(fcsm);
return;
}
std::unique_lock<std::shared_mutex> lock(ilock_1);
/*
* fcsm MUST only be created if filecache_handle is set.
*/
assert(filecache_handle);
if (!fcsm) {
assert(!fcsm_alloced);
fcsm = std::make_shared<struct fcsm>(client, this);
fcsm_alloced = true;
}
}
/**
* This MUST be called only after has_fcsm() returns true.
* See comment above get_filecache().
*
* Note: This MUST be called only when has_fcsm() returns true.
*
* LOCKS: None.
*/
const std::shared_ptr<struct fcsm>& get_fcsm() const
{
assert(is_regfile());
assert(fcsm_alloced);
assert(fcsm);
return fcsm;
}
std::shared_ptr<struct fcsm>& get_fcsm()
{
assert(is_regfile());
assert(fcsm_alloced);
assert(fcsm);
return fcsm;
}
/**
* External users of this nfs_inode can check for presence of fcsm
* by calling has_fcsm().
*
* LOCKS: None.
*/
bool has_fcsm() const
{
assert(is_regfile());
assert(!fcsm_alloced || fcsm);
return fcsm_alloced;
}
/**
* This must be called from all paths where we respond to a fuse request
* that amounts to open()ing a file/directory. Once a file/directory is
* open()ed, the application can call all the POSIX APIs that take an fd, so
* anything we defer in the nfs_inode constructor (as we are not sure if the
* application will call any POSIX API on the file) must be allocated
* here.
*
* LOCKS: Exclusive ilock_1.
*/
void on_fuse_open(enum fuse_opcode optype)
{
/*
* Only these fuse ops correspond to open()/creat() which return an
* fd.
*/
assert((optype == FUSE_CREATE) ||
(optype == FUSE_OPEN) ||
(optype == FUSE_OPENDIR));
opencnt++;
AZLogDebug("[{}:{}] on_fuse_open({}), new opencnt is {}",
get_filetype_coding(), ino, (int) optype, opencnt.load());
if (is_regfile()) {
/*
* Allocate filecache_handle before readahead_state and fcsm as we
* assert for filecache_handle in alloc_rastate() and alloc_fcsm().
*/
alloc_filecache();
alloc_rastate();
alloc_fcsm();
} else if (is_dir()) {
alloc_dircache();
}
}
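/*
* Example (illustrative sketch): a fuse open handler is expected to call
* on_fuse_open() before replying to fuse. Handler and helper names below
* are hypothetical, only on_fuse_open() and fuse_reply_open() are real.
*
*   void aznfsc_ll_open(fuse_req_t req, fuse_ino_t ino,
*                       struct fuse_file_info *fi)
*   {
*       struct nfs_inode *inode = get_nfs_inode_from_ino(ino);
*       inode->on_fuse_open(FUSE_OPEN);
*       fuse_reply_open(req, fi);
*   }
*/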
/**
* This must be called from all paths where we respond to a fuse request
* that makes fuse aware of this inode. It could be lookup or readdirplus.
* Once fuse receives an inode it can call operations like lookup/getattr.
* See on_fuse_open(), which is called by paths that return not only the
* inode but also an fd to the application, f.e. creat().
*
* LOCKS: Exclusive ilock_1.
*/
void on_fuse_lookup(enum fuse_opcode optype)
{
/*
* Only these fuse ops correspond to operations that return an inode
* to fuse, but don't cause a fd to be returned to the application.
* FUSE_READDIR and FUSE_READDIRPLUS are the only other ops that return
* inode to fuse but we don't call on_fuse_lookup() for those as they
* could be a lot and most commonly applications will not perform IO
* on all files returned by readdir/readdirplus.
*/
assert((optype == FUSE_LOOKUP) ||
(optype == FUSE_MKNOD) ||
(optype == FUSE_MKDIR) ||
(optype == FUSE_SYMLINK));
if (is_regfile()) {
assert(optype == FUSE_LOOKUP ||
optype == FUSE_MKNOD);
} else if (is_dir()) {
assert(optype == FUSE_LOOKUP ||
optype == FUSE_MKDIR);
/*
* We have a unified cache for readdir/readdirplus and lookup, so
* we need to create the readdir cache on lookup.
*/
alloc_dircache(optype == FUSE_MKDIR);
}
}
/**
* Return the fuse inode number for this inode.
*/
fuse_ino_t get_fuse_ino() const
{
assert(ino != 0);
return ino;
}
/**
* Return the generation number for this inode.
*/
uint64_t get_generation() const
{
assert(generation != 0);
return generation;
}
/**
* Get ref to the superblock structure.
* Caller must ensure that any access to the superblock structure is done
* while duly holding the sb_lock.
*/
static struct nfs_superblock& get_sb()
{
return sb;
}
static std::shared_mutex& get_sb_lock()
{
return sb.sb_lock;
}
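/*
* Example (illustrative sketch): reading a superblock property while duly
* holding sb_lock, as required by the comment above.
*
*   uint64_t bsize;
*   {
*       std::shared_lock<std::shared_mutex> lock(nfs_inode::get_sb_lock());
*       bsize = nfs_inode::get_sb().get_blocksize();
*   }
*/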
/**
* Use this to safely fetch the inode attributes.
*
* LOCKS: Shared ilock_1.
*/
struct stat get_attr() const
{
/*
* Following inode lock will be released after attr is copied to the
* caller.
*/
std::shared_lock<std::shared_mutex> lock(ilock_1);
return attr;
}
/**
* Caller MUST hold shared ilock_1.
*/
const struct stat& get_attr_nolock() const
{
return attr;
}
/**
* Caller MUST hold exclusive ilock_1.
*/
struct stat& get_attr_nolock()
{
return attr;
}
/**
* Populate 'fattr' with this inode's attributes.
*/
void fattr3_from_stat(struct fattr3& fattr) const;
int get_silly_rename_level()
{
return silly_rename_level++;
}
/**
* Return the NFS fileid. This is also the inode number returned by
* stat(2).
* Caller MUST hold ilock_1.
*/
uint64_t get_fileid() const
{
assert(attr.st_ino != 0);
return attr.st_ino;
}
/**
* Marks the attribute cache as expired for the inode.
* Any call to attr_cache_expired() after this call MUST return true and
* hence caller MUST NOT try to use the saved attribute cache of this inode.
* Typically this is called when a file/dir is deleted and we don't want
* any subsequent getattr call to return attributes for deleted file/dir.
*/
void invalidate_attribute_cache()
{
// Set it to 0 to force attr_cache_expired() to always return true.
attr_timeout_timestamp = 0;
}
/**
* Checks whether inode->attr is expired as per the current actimeo.
*/
bool attr_cache_expired() const
{
/*
* This is set in the constructor as a newly created nfs_inode always
* has attributes cached in nfs_inode::attr.
*/
assert(attr_timeout_timestamp != -1);
const int64_t now_msecs = get_current_msecs();
const bool attr_expired = (attr_timeout_timestamp < now_msecs);
return attr_expired;
}
void set_truncate_in_progress()
{
assert(!truncate_in_progress);
truncate_in_progress = true;
}
void clear_truncate_in_progress()
{
assert(truncate_in_progress);
truncate_in_progress = false;
}
bool is_truncate_in_progress() const
{
return truncate_in_progress;
}
int64_t get_cached_filesize() const
{
assert(is_regfile());
assert(has_filecache());
const int64_t cached_filesize = get_filecache()->get_cache_size();
assert(cached_filesize >= 0);
assert(cached_filesize <= (off_t) AZNFSC_MAX_FILE_SIZE);
return cached_filesize;
}
/**
* Get the estimated file size on the server. Note that this is based on
* cached attributes hence the returned size is at best an estimate and may
* not exactly match the most recent file size on the server. Callers are
* warned about that and they should not use it for any hard failures that
* may be in violation of the protocol.
* If cached attributes have expired (as per the configured actimeo) then
* it returns -1 and caller must handle it, unless caller does not care
* and passed dont_check_expiry as true.
*
* Note: Use get_file_sizes() if you need both server and client file
* sizes.
*/
int64_t get_server_file_size(const bool dont_check_expiry = false) const
{
/*
* XXX We access attr.st_size w/o holding ilock_1 as aligned access
* to uint64_t should be safe, moreover we want to avoid the
* ilock_1 in the read fastpath.
*/
assert((size_t) attr.st_size <= AZNFSC_MAX_FILE_SIZE);
if (dont_check_expiry) {
return attr.st_size;
}
return attr_cache_expired() ? -1 : attr.st_size;
}
/**
* Get client's most recent estimate of the file size.
* Note that unlike get_server_file_size() which estimates the file size
* strictly as present on the server, this is a size estimate that matters
* from the client applications' pov. It considers the cached filesize
* also and returns the max of the server file size and cached filesize.
* Note that cached filesize corresponds to data which has not yet been
* synced with the server, so won't be reflected in the server file size,
* but reader applications would be interested in cached data too.
*
* Returns -1 to indicate that we do not have a good estimate of the file
* size. Since we always know the cached filesize for sure, this happens
* when we do not know the recent server file size (within the last
* attributes cache timeout period).
*
* Note: Use get_file_sizes() if you need both server and client file
* sizes.
*/
int64_t get_client_file_size() const
{
const int64_t sfsize = get_server_file_size();
if (sfsize == -1) {
/*
* We don't know server size, so we cannot estimate
* effective client file size for sure.
*/
return -1;
}
return std::max(sfsize, get_cached_filesize());
}
/**
* Get both server and client file sizes.
* Use this when you need to know both server and client file sizes
* atomically, i.e., it will either return -1 for both client and server
* file sizes or it'll return valid value for both.
*/
void get_file_sizes(int64_t& cfsize, int64_t& sfsize) const
{
sfsize = get_server_file_size();
if (sfsize == -1) {
cfsize = -1;
// We don't know either.
assert((cfsize == -1) && (sfsize == -1));
return;
}
cfsize = std::max(sfsize, get_cached_filesize());
// We know both.
assert((cfsize != -1) && (sfsize != -1));
}
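/*
* Example (illustrative sketch): fetching both size views consistently,
* f.e. before deciding how much of a read can be served from cache.
*
*   int64_t cfsize, sfsize;
*   inode->get_file_sizes(cfsize, sfsize);
*   if (cfsize == -1) {
*       // Cached attributes have expired, caller must revalidate first.
*   }
*/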
/**
* This must be called from copy_to_cache() whenever we successfully copy
* some data to filecache.
*
* Note: It doesn't update attr.ctime and attr.mtime deliberately as this
* is not authoritative info and we would want to fetch attributes
* from server when needed.
*/
void on_cached_write(off_t offset, size_t length)
{
[[maybe_unused]]
const off_t new_size = offset + length;
[[maybe_unused]]
const off_t cached_filesize = (off_t) get_filecache()->get_cache_size();
/*
* on_cached_write() is called after set_uptodate() so cached_filesize
* must already have been updated.
*/
assert(cached_filesize >= new_size);
}
/**
* Check if [offset, offset+length) lies within the current RA window.
* bytes_chunk_cache would call this to find out if a particular membuf
* can be purged. Membufs in RA window would mostly be used soon and
* should not be purged.
* Note that it checks if there is any overlap and not whether it fits
* entirely within the RA window.
*
* LOCKS: None.
*/
bool in_ra_window(uint64_t offset, uint64_t length) const;
/**
* Is this file currently open()ed by any application.
*/
bool is_open() const
{
return opencnt > 0;
}
/**
* Return the nfs_inode corresponding to filename in the directory
* represented by this inode.
* It'll hold a lookupcnt ref on the returned inode and caller must drop
* that ref by calling decref().
*
* Note: Shared readdircache_lock_2.
*/
struct nfs_inode *dnlc_lookup(const char *filename,
bool *negative_confirmed = nullptr) const
{
assert(is_dir());
if (has_dircache()) {
struct nfs_inode *inode =
dircache_handle->dnlc_lookup(filename, negative_confirmed);
// dnlc_lookup() must have held a lookupcnt ref.
assert(!inode || inode->lookupcnt > 0);
return inode;
}
return nullptr;
}
/**
* Add DNLC entry "filename -> inode".
*/
void dnlc_add(const char *filename, struct nfs_inode *inode)
{
assert(filename);
assert(inode);
assert(inode->magic == NFS_INODE_MAGIC);
assert(is_dir());
/*
* Directory inodes returned by READDIRPLUS won't have dircache
* allocated, and fuse may call lookup on them, so allocate dircache now
* before calling dnlc_add().
*/
alloc_dircache();
dircache_handle->dnlc_add(filename, inode);
}
/*
* Find nfs_inode for 'filename' in this directory.
* It first searches in dnlc and if not found there makes a sync LOOKUP
* call. If sync LOOKUP fails it returns nullptr and sets failure_status
* to a +ve errno value.
* This calls revalidate().
*/
struct nfs_inode *lookup(const char *filename,
int *failure_status = nullptr);
/**
* Note usecs when the last cached write was received for this inode.
* A cached write is not a direct application write but a write cached by
* the fuse kernel driver and dispatched later as a possibly bigger write.
* These have fi->writepage set.
* We use this to decide if we need to no-op a setattr(mtime) call.
* Note that fuse does not provide filesystems a way to convey "nocmtime",
* i.e. fuse should not call setattr(mtime) to set file mtime during
* cached write calls. Fuse will not call setattr(mtime) if we are not
* using kernel cache as it expects the filesystem to manage mtime itself,
* but if kernel cache is used fuse calls setattr(mtime) very often which
* slows down the writes. Since our backing filesystem is NFS it'll take
* care of updating mtime and hence we can ignore such setattr(mtime)
* calls. To distinguish setattr(mtime) done as a result of writes from
* ones that are done as a result of explicit utime() call by application,
* we check if we have seen cached write recently.
*/
void stamp_cached_write()
{
if (aznfsc_cfg.cache.data.kernel.enable) {
last_cached_write = get_current_usecs();
}
}
/**
* Should we skip setattr(mtime) call for this inode?
* See discussion above stamp_cached_write().
* new_mtime is the updated mtime that fuse wants to set.
* If we propose to skip mtime update, and inode's cached mtime is older
* than new_mtime, we refresh inode's cached mtime and ctime.
*
* LOCKS: Exclusive ilock_1.
*/
bool skip_mtime_update(const struct timespec& new_mtime)
{
// Caller must pass a valid mtime.
assert(new_mtime.tv_sec != 0);
static const int64_t one_sec = 1000 * 1000ULL;
const int64_t now_usecs = get_current_usecs();
const int64_t now_msecs = now_usecs / 1000ULL;
const bool attrs_valid = (attr_timeout_timestamp >= now_msecs);
/*
* Kernel can be sending multiple writes/setattr in parallel over
* multiple fuse threads, hence last_cached_write may be greater
* than now_usecs.
*/
const bool write_seen_recently =
((last_cached_write > now_usecs) ||
((now_usecs - last_cached_write) < one_sec));
/*
* We skip setattr(mtime) if we have seen a cached write in the last
* one sec and if we have valid cached attributes for this inode.
* Note that we need to return updated attributes in setattr response.
*/
const bool skip = (write_seen_recently && attrs_valid);
if (skip) {
std::unique_lock<std::shared_mutex> lock(ilock_1);
if (compare_timespec(new_mtime, attr.st_mtim) > 0) {
attr.st_mtim = new_mtime;
if (compare_timespec(new_mtime, attr.st_ctim) > 0) {
attr.st_ctim = new_mtime;
}
}
}
return skip;
}
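/*
* Example (illustrative sketch): how a setattr handler might use
* skip_mtime_update() to elide setattr(mtime) calls caused by kernel
* cached writes. The surrounding handler plumbing is hypothetical.
*
*   if ((to_set & FUSE_SET_ATTR_MTIME) &&
*       inode->skip_mtime_update(attr->st_mtim)) {
*       // Reply with (freshly refreshed) cached attributes, skip SETATTR.
*       const struct stat st = inode->get_attr();
*       fuse_reply_attr(req, &st, inode->get_actimeo());
*       return;
*   }
*/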
/**
* Is commit pending for this inode?
*/
bool is_commit_pending() const
{
assert(commit_state != commit_state_t::INVALID);
return (commit_state == commit_state_t::NEEDS_COMMIT);
}
/**
* Set NEEDS_COMMIT state for this inode.
* Note: this is set to let the flushing task know that a commit is pending
* so that it can start the commit task.
*/
void set_commit_pending()
{
// Commit can be set to pending only if it is in commit_not_needed state.
assert(commit_state == commit_state_t::COMMIT_NOT_NEEDED);
commit_state = commit_state_t::NEEDS_COMMIT;
}
/**
* Is commit in progress for this inode?
*/
bool is_commit_in_progress() const
{
assert(commit_state != commit_state_t::INVALID);
return (commit_state == commit_state_t::COMMIT_IN_PROGRESS);
}
/**
* Set commit_in_progress state for this inode.
*/
void set_commit_in_progress()
{
assert(commit_state != commit_state_t::INVALID);
assert(commit_state != commit_state_t::COMMIT_IN_PROGRESS);
commit_state = commit_state_t::COMMIT_IN_PROGRESS;
}
/**
* Clear commit_in_progress state for this inode.
*/
void clear_commit_in_progress()
{
assert(commit_state == commit_state_t::COMMIT_IN_PROGRESS);
commit_state = commit_state_t::COMMIT_NOT_NEEDED;
}
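/*
* Example (illustrative sketch): commit state transitions as described
* above, driven from the flush path (surrounding logic is hypothetical).
*
*   // Flush task notices enough uncommitted data.
*   if (!inode->is_commit_pending() && !inode->is_commit_in_progress()) {
*       inode->set_commit_pending();
*   }
*
*   // Once ongoing flushing drains (bytes_flushing == 0).
*   if (inode->is_commit_pending()) {
*       inode->set_commit_in_progress();
*       // ... issue the COMMIT RPC, and from its callback ...
*       inode->clear_commit_in_progress();
*   }
*/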
/**
* Increment lookupcnt of the inode.
*/
void incref() const
{
lookupcnt++;
AZLogDebug("[{}] lookupcnt incremented to {} (dircachecnt: {}, "
"forget_expected: {})",
ino, lookupcnt.load(), dircachecnt.load(),
forget_expected.load());
}
/**
* Decrement lookupcnt of the inode and delete it if lookupcnt
* reaches 0.
* 'cnt' is the amount by which the lookupcnt must be decremented.
* This is usually the nlookup parameter passed by fuse FORGET, when
* decref() is called from fuse FORGET, else it's 1.
* 'from_forget' should be set to true when calling decref() for
* handling fuse FORGET. Note that fuse FORGET is special as it
* conveys important information about the inode. Since FORGET may
* mean that fuse VFS does not have any reference to the inode, we can
* use that to perform some important tasks like purging the readdir cache
* for directory inodes. This is important as it makes the client behave
* like the kernel NFS client where flushing the cache causes the
* directory cache to be flushed, and this can be a useful technique
* in cases where NFS client is not being consistent with the server.
*/
void decref(size_t cnt = 1, bool from_forget = false);
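/*
* Example (illustrative sketch): every ref taken, f.e. by dnlc_lookup()
* which returns the inode with a lookupcnt ref held, must eventually be
* dropped with decref().
*
*   struct nfs_inode *child = dir_inode->dnlc_lookup("foo");
*   if (child) {
*       // ... use child ...
*       child->decref();
*   }
*/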
/**
* Returns true if inode is FORGOTten by fuse.
* Forgotten inodes will not be referred by fuse in any api call.
* Note that forgotten inodes may still hang around if they are
* referenced by at least one directory_entry cache.
*/
bool is_forgotten() const
{
return (lookupcnt == 0);
}
/**
* Is this inode cached by any readdirectory_cache?
*/
bool is_dircached() const
{
return (dircachecnt > 0);
}
nfs_client *get_client() const
{
assert(client != nullptr);
return client;
}
const struct nfs_fh3& get_fh() const
{
return fh.get_fh();
}
uint32_t get_crc() const
{
return crc;
}
bool is_dir() const
{
return (file_type == S_IFDIR);
}
// Is regular file?
bool is_regfile() const
{
return (file_type == S_IFREG);
}
/**
* Short character code for file_type, useful for logs.
*/
char get_filetype_coding() const
{
#ifndef ENABLE_NON_AZURE_NFS
assert(file_type == S_IFDIR ||
file_type == S_IFREG ||
file_type == S_IFLNK);
#endif
return (file_type == S_IFDIR) ? 'D' :
((file_type == S_IFLNK) ? 'S' :
((file_type == S_IFREG) ? 'R' : 'U'));
}
/**
* Get the minimum attribute cache timeout value in seconds, to be used
* for this file.
*/
int get_actimeo_min() const;
/**
* Get the maximum attribute cache timeout value in seconds, to be used
* for this file.
*/
int get_actimeo_max() const;
/**
* Get current attribute cache timeout value (in secs) for this inode.
* Note that the attribute cache timeout moves between the min and max
* values returned by the above methods, depending on whether the last
* revalidation attempt was a success or not.
*/
int get_actimeo() const
{
// If not set, return the min configured value.
return (attr_timeout_secs != -1) ? attr_timeout_secs.load()
: get_actimeo_min();
}
/**
* Copy application data into the inode's file cache.
*
* bufv: fuse_bufvec containing application data, passed by fuse.
* offset: starting offset in file where the data should be written.
* extent_left: after this copy what's the left edge of the longest dirty
* extent containing this latest write.
* extent_right: after this copy what's the right edge of the longest dirty
* extent containing this latest write.
* Caller can use the extent length information to decide if it wants to
* dispatch an NFS write right now or wait and batch more, usually by
* comparing it with the wsize value.
*
* Returns 0 if copy was successful, else a +ve errno value indicating the
* error. This can be passed as-is to the rpc_task reply_error() method to
* convey the error to fuse.
* EAGAIN is the special error code that would mean that caller must retry
* the current copy_to_cache() call.
*
* Note: The membufs to which the data is copied will be marked dirty and
* uptodate once copy_to_cache() returns.
*/
int copy_to_cache(const struct fuse_bufvec* bufv,
off_t offset,
uint64_t *extent_left,
uint64_t *extent_right);
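/*
* Example (illustrative sketch): typical write path usage, retrying on
* EAGAIN and using the returned extent size to decide whether to flush
* now. aznfsc_cfg.wsize below is assumed to be the configured wsize.
*
*   uint64_t extent_left, extent_right;
*   int err;
*   do {
*       err = inode->copy_to_cache(bufv, offset,
*                                  &extent_left, &extent_right);
*   } while (err == EAGAIN);
*
*   if (err == 0 && (extent_right - extent_left) >= aznfsc_cfg.wsize) {
*       // Enough contiguous dirty data, kick off a flush.
*   }
*/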
/**
* Flush the dirty file cache represented by filecache_handle and wait
* till all dirty data is sync'ed with the NFS server. Only dirty data
* in the given range is flushed if provided, else all dirty data is
* flushed.
* Note that filecache_handle is the only writeback cache that we have
* and hence this only flushes that.
* For a non-reg file inode this will be a no-op.
* Returns 0 on success and a positive errno value on error.
*
* Note: This doesn't take the inode lock but instead it would grab the
* filecache_handle lock and get the list of dirty membufs at this
* instant and flush those. Any new dirty membufs added after it
* queries the dirty membufs list, are not flushed.
*
* Note: This grabs the inode flush_lock to ensure that it doesn't
* initiate any new flush operations while some truncate call is in
* progress (which must have held the flush_lock).
*/
int flush_cache_and_wait();
/**
* Wait for currently flushing/committing membufs to complete.
* It will wait till the currently flushing membufs complete and then
* issue a commit and wait for that. If no flush is ongoing but there's
* commit_pending data, it'll commit that and return after the commit
* completes.
* Returns 0 on success and a positive errno value on error.
* Once it returns, commit_pending will be 0.
*
* Note: Caller must hold the inode flush_lock to ensure that
* no new membufs are added till this call completes.
* It may release the flush_lock() if it has to wait for ongoing
* flush/write requests to complete, but it'll exit with flush_lock
* held.
*/
int wait_for_ongoing_flush();
/**
* commit_membufs() is called to commit uncommitted membufs to the Blob.
* It creates commit RPC and sends it to the NFS server.
*/
void commit_membufs(std::vector<bytes_chunk> &bcs);
/**
* switch_to_stable_write() is called to switch the inode to stable write
* mode. It waits for all ongoing flush and subsequent commit to complete.
* If not already scheduled, it'll perform an explicit commit after the
* flush completes.
* Post that it'll mark inode for stable write and return. From then on
* any writes to this inode will be sent as stable writes.
*/
void switch_to_stable_write();
/**
* Check if stable write is required for the given offset.
* Given offset is the start of contiguous dirty membufs that need to be
* flushed to the Blob.
*/
bool check_stable_write_required(off_t offset);
/**
* Wait for ongoing commit operation to complete.
*/
void wait_for_ongoing_commit();
/**
* Sync the dirty membufs in the file cache to the NFS server.
* All contiguous dirty membufs are clubbed together and sent to the
* NFS server in a single write call.
* If parent_task is non-null, it's the frontend write task that must be
* completed once all these flushes complete. This can be used by the
* caller in case of memory pressure when we want to delay fuse callbacks
* to slow down writes which can cause more memory to be dirtied.
*
* Note: sync_membufs() can free parent_task if all issued backend
* writes complete before sync_membufs() could return.
* DO NOT access parent_task after sync_membufs() returns.
*/
void sync_membufs(std::vector<bytes_chunk> &bcs, bool is_flush,
struct rpc_task *parent_task = nullptr);
/**
* Called when last open fd is closed for a file/dir.
* inode release() drops an opencnt on the inode.
* If this was not the last opencnt or if it's called for a dir, then it
* doesn't do anything more, else it does the following for regular files:
* - If release is called for a silly-renamed file, then it drops the
* cache (no need to flush as the file is being deleted anyway) and
* unlinks the file.
* - If not a silly-renamed file, then it flushes the cache.
* This is needed for CTO consistency.
*
* When called from a fuse handler, req parameter must be passed and it'll
* arrange to call the fuse callback for req, once it completes the above.
* When not called from a fuse handler, req must not be passed.
*
* It returns true if it wants the caller to call the fuse callback, else
* it has already arranged to call the fuse callback and caller doesn't
* need to call.
*/
bool release(fuse_req_t req = nullptr);
/**
* Lock the inode for flushing.
*
* Note: DO NOT TAKE flush_lock WHILE WAITING FOR NFS WRITE RPC RESPONSE.
* THIS CAN CAUSE A DEADLOCK AS write_iov_callback()->on_flush_complete()
* TAKES THE flush_lock() TOO.
*/
void flush_lock() const;
void flush_unlock() const;
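/*
* Example (illustrative sketch): serializing a size-changing operation
* against ongoing flush/commit. Never call flush_lock() from a context
* that waits on an NFS WRITE RPC response (see warning above).
*
*   inode->flush_lock();
*   // No new flush/commit can start now; wait for ongoing ones and then
*   // safely update the backend file size (f.e. via SETATTR).
*   inode->flush_unlock();
*/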
/**
* Revalidate the inode.
* Revalidation is done by querying the inode attributes from the server
* and comparing them against the saved attributes. If the freshly fetched
* attributes indicate "change in file/dir content" by indicators such as
* mtime and/or size, then we invalidate the cached data of the inode.
* If 'force' is false then inode attributes are fetched only if the last
* fetched attributes are older than attr_timeout_secs, while if 'force'
* is true we fetch the attributes regardless. This could f.e., be needed
* when a file/dir is opened (for close-to-open consistency reasons).
* Other reasons for force invalidating the caches could be if file/dir
* was updated by calls to write()/create()/rename().
*
* LOCKS: If revalidating it'll take exclusive ilock_1.
*/
void revalidate(bool force = false);
/**
* Update the inode given that we have received fresh attributes from
* the server. These fresh attributes could have been received as
* postop (and preop) attributes to any of the requests or it could be a
* result of explicit GETATTR call that we make from revalidate() when the
* attribute cache times out.
* We process the freshly received attributes as follows:
* - If the ctime has not changed, then the file has not changed, and
* we don't do anything, else
* - If mtime has changed then the file data and metadata has changed
* and we need to drop the caches and update nfs_inode::attr, else
* - If just ctime has changed then only the file metadata has changed
* and we update nfs_inode::attr from the received attributes.
*
* Returns true if preattr/postattr indicate that the file has changed
* (either just metadata, or both data and metadata) since we cached it,
* false indicates that the file has not
* changed.
*
* LOCKS: Caller must take exclusive ilock_1.
*/
bool update_nolock(const struct fattr3 *postattr,
const struct wcc_attr *preattr = nullptr);
/**
* Convenience function that calls update_nolock() after holding the
* inode lock.
*
* LOCKS: Exclusive ilock_1.
*
* XXX This MUST be called whenever we get fresh attributes for a file,
* most commonly as post-op attributes along with some RPC response.
*/
bool update(const struct fattr3 *postattr,
const struct wcc_attr *preattr = nullptr)
{
std::unique_lock<std::shared_mutex> lock(ilock_1);
return update_nolock(postattr, preattr);
}
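/*
* Example (illustrative sketch): feeding freshly received postop
* attributes into the inode from an RPC callback. The response field
* names below are hypothetical.
*
*   if (res->attributes_follow) {
*       if (inode->update(&res->attributes)) {
*           // File changed behind our back, caches were invalidated.
*       }
*   }
*/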
/**
* Force update inode->attr with fattr.
* Unlike update_nolock() it doesn't invalidate the cache.
* Use it when you know that cache need not be invalidated, as it's
* already done.
*/
void force_update_attr_nolock(const struct fattr3& fattr);
void force_update_attr(const struct fattr3& fattr)
{
std::unique_lock<std::shared_mutex> lock(ilock_1);
force_update_attr_nolock(fattr);
}
/**
* Invalidate/zap the cached data. This will correctly invalidate cached
* data for both file and directory caches.
* By default it will just mark the cache as invalid and the actual purging
* will be deferred till the next access to the cache, and will be done in
* the context that accesses the cache, but the caller can request the cache
* to be purged inline by passing purge_now as true.
*
* 'shutdown' argument doesn't necessarily mean we are calling this from
* shutdown path, but it means that the caller wants to be stricter about
* purging the caches. f.e., for file cache it would mean disregarding
* inuse, dirty or any of the flags for which we usually won't purge a membuf.
* This is mostly true when calling from shutdown but can be true in
* other cases too.
* shutdown is ignored for directory cache.
*
* We have the following cases (for file cache):
* 1. purge_now=false, shutdown=false: Don't purge now, but on next call to
* bytes_chunk_cache::scan().
* 2. purge_now=true, shutdown=false: Purge now, but skip inuse and dirty
* membufs.
* 3. purge_now=true, shutdown=true: Purge now, forcing purge for inuse and
* dirty membufs too, they should not
* exist and hence we assert for them.
* 4. purge_now=false, shutdown=true: Invalid call.
*
* LOCKS: None when purge_now is false.
* When purge_now is true, exclusive chunkmap_lock_43 for files and
* exclusive readdircache_lock_2 for directories.
*/
void invalidate_cache(bool purge_now = false, bool shutdown = false)
{
/*
* shutdown implies force purging which only makes sense if purge_now
* is true.
*/
assert(!shutdown || purge_now);
if (is_dir()) {
if (has_dircache()) {
assert(dircache_handle);
AZLogDebug("[{}] Invalidating dircache", get_fuse_ino());
dircache_handle->invalidate();
if (purge_now) {
AZLogDebug("[{}] (Purgenow) Purging dircache", get_fuse_ino());
dircache_handle->clear();
AZLogDebug("[{}] (Purgenow) Purged dircache", get_fuse_ino());
}
}
} else if (is_regfile()) {
if (has_filecache()) {
assert(filecache_handle);
AZLogDebug("[{}] Invalidating filecache", get_fuse_ino());
filecache_handle->invalidate();
if (purge_now) {
/*
* Wait for ongoing readaheads to complete, else they would
* not have dropped membuf lock and inuse count, and clear()
* would incorrectly complain.
*/
if (has_rastate()) {
get_rastate()->wait_for_ongoing_readahead();
}
AZLogDebug("[{}] (Purgenow) {}Purging filecache",
get_fuse_ino(), shutdown ? "Force ": "");
filecache_handle->clear(shutdown /* shutdown */);
AZLogDebug("[{}] (Purgenow) {}Purged filecache",
get_fuse_ino(), shutdown ? "Force ": "");
}
}
}
}
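/*
* Example (illustrative sketch): the invalidation flavors described above.
*
*   // Lazy: mark invalid, purge on next access to the cache.
*   inode->invalidate_cache();
*
*   // Eager: purge right away, skipping inuse and dirty membufs.
*   inode->invalidate_cache(true);        // purge_now
*
*   // Shutdown: purge right away, nothing should be inuse/dirty.
*   inode->invalidate_cache(true, true);  // purge_now + shutdown
*/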
/**
* Store the first error encountered while writing dirty
* membuf to Blob.
*/
void set_write_error(int error)
{
assert(error > 0);
if (this->write_error == 0) {
this->write_error = error;
}
}
/**
* Returns the error, saved by prior call to set_write_error().
* Can be 0 for success, or a +ve errno value.
*/
int get_write_error() const
{
assert(write_error >= 0);
return write_error;
}
/**
* Set the stable write flag.
*/
void set_stable_write()
{
assert(!stable_write);
stable_write = true;
// Only unstable writes use putblock_filesize.
putblock_filesize = AZNFSC_BAD_OFFSET;
}
/**
* Check if the inode has stable write flag set.
*/
bool is_stable_write() const
{
return stable_write;
}
/**
* Directory cache lookup method.
*
* cookie: offset in the directory from which the entries should be listed.
* max_size: do not return entries more than these many bytes.
* results: returned entries are populated in this vector. Each of these
* entry has a shared_ptr ref held so they can be safely used even
* if the actual directory_entry in readdirectory_cache is deleted.
* eof: will be set if there are no more entries in the directory, after
* the last entry returned.
* readdirplus: consumer of the returned directory entries is readdirplus.
* This will affect how the size of entries is added while
* comparing with max_size. If readdirplus is true, then we
* account for attribute size too, since readdirplus would
* be sending attributes too.
*/
void lookup_dircache(
cookie3 cookie,
size_t max_size,
std::vector<std::shared_ptr<const directory_entry>>& results,
bool& eof,
bool readdirplus);
};
#endif /* __NFS_INODE_H__ */