turbonfs/inc/rpc_readdir.h (182 lines of code) (raw):
#ifndef __READDIR_RPC_TASK__
#define __READDIR_RPC_TASK__
#include "aznfsc.h"
#include <map>
#include <shared_mutex>
#include <vector>
#include <ctime>
#include <dirent.h>
/*
 * Cache size limit in bytes: 1024 * 1024 * 1024 = 1GiB.
 * 1GB should be sufficient to support ~4M entries in a directory (at roughly
 * 250 bytes per cached entry, see directory_entry::get_cache_size()); of
 * course it will vary based on file name lengths.
 * NOTE(review): presumably the upper bound for readdirectory_cache's
 * cache_size -- the enforcement site is not in this header, confirm against
 * the users of this macro.
 */
#define MAX_CACHE_SIZE_LIMIT (1024ULL * 1024 * 1024)
/*
* This is an entry in the unified readdir/DNLC cache.
* Note that an entry can be added to the unified cache, via one of the
* following:
* 1. READDIRPLUS response.
* This creates the most complete entry with a valid cookie and a valid
* nfs_inode pointer (with attributes and filehandle).
* This can serve: READDIRPLUS, READDIR and LOOKUP requests.
* This is referred to as a Type (1) entry.
* 2. READDIR response.
* This creates an entry with a valid cookie but no nfs_inode pointer.
* Only attributes.st_ino (the inode number) is valid.
* This can serve: READDIR requests.
* This is referred to as a Type (2) entry.
* 3. LOOKUP response.
* This creates an entry with a special cookie (which is not possible in
* READDIR/READDIRPLUS responses) but a valid nfs_inode pointer.
* This is referred to as a Type (3) entry.
* This can serve: LOOKUP requests. Though it has the nfs_inode pointer
* it doesn't have the cookie, hence cannot serve directory
* enumeration requests which need a valid cookie.
*
* Note: Blob NFS uses cookies starting at 1 and increasing by 1 for every
* file, so we use UINT64_MAX/2 as the starting value for the special
* cookie. This should never be returned in READDIR/READDIRPLUS response
* hence we won't mistake a type (3) entry as type (1).
*
* Note on updating directory entries added to readdirectory_cache:
*
* READDIR and READDIRPLUS responses will always update old entries, deleting
* existing ones and adding new ones. This means if we have a type (1) entry
* and we get a READDIR response, it'll be deleted and a new type (2) entry
* will be created.
*
* LOOKUP response will update the entry with the following rules:
* Note that we don't want to blindly replace type (1) or (2) entries with
* type (3) entries as those are not usable by READDIR/READDIRPLUS then.
*
* - If we have a type (1) entry and the new nfs_inode in the lookup response
* matches the saved one, don't do anything. This is the common case.
* - If we have a type (1) entry and the new nfs_inode does not match the saved
* one, it means the file was either renamed or deleted+recreated. Next
* time when aznfsc_ll_readdir{plus}() is called it'll purge the entire
* readdir cache as the parent directory mtime would be different, thus
* ensuring correctness, but if lookup is called before readdir/readdirplus
* it'll delete the old entry and create a new type (3) entry.
* - If we have a type (2) entry delete that and add a new entry with the same
* cookie and the new nfs_inode received in LOOKUP response.
* - If we have a type (3) entry and the new nfs_inode matches the saved one,
* don't do anything.
* - If we have a type (3) entry and the new nfs_inode does not match the
* saved one, delete the old entry and create a new type (3) entry.
*/
// A single immutable entry of the unified readdir/DNLC cache.
struct directory_entry
{
    /*
     * Cookie of this entry. For entries created from READDIR/READDIRPLUS
     * responses this is the cookie returned by the server, for entries
     * created from LOOKUP responses it is the special (non-enumerable) cookie.
     */
    const cookie3 cookie;

    /*
     * File attributes. Fully valid only if has_attributes is true, for
     * READDIR fetched entries only attributes.st_ino is valid.
     */
    const struct stat attributes;

    /*
     * whether 'attributes' holds valid attributes?
     * directory_entry which are made as a result of READDIR call, would
     * not have the attributes. Those can only be used by subsequent
     * readdir calls made by fuse. If fuse makes readdirplus call and
     * we don't have the attributes, we treat it as "entry not found"
     * and reach out to server with a READDIRPLUS call and on receipt
     * of response update the directory_entry cache, this time with
     * attributes.
     */
    const bool has_attributes;

    /*
     * Again, for READDIR fetched entries, we won't know the filehandle
     * (and the fileid), hence we won't have the inode set.
     */
    struct nfs_inode *const nfs_inode;

    // Name of the file/dir this entry refers to, NUL terminated.
    const char *const name;

    // Constructor for adding a readdirplus returned entry.
    directory_entry(char* name_,
                    cookie3 cookie_,
                    const struct stat& attr,
                    struct nfs_inode* nfs_inode_);

    // Constructor for adding a readdir returned entry.
    directory_entry(char* name_,
                    cookie3 cookie_,
                    uint64_t fileid_);

    ~directory_entry();

    /*
     * Not copyable.
     * This struct has a user-defined destructor and holds the raw pointers
     * 'name' and 'nfs_inode'. An implicit member-wise copy would duplicate
     * those pointers and the destructor would then run once for every copy.
     * NOTE(review): the destructor is defined out of line; presumably it
     * releases 'name' and/or a ref on 'nfs_inode', in which case a copy would
     * release them twice -- confirm.
     * Entries are meant to be shared through std::shared_ptr instead.
     * (Assignment was already implicitly deleted because of the const
     * members, it's spelled out here for clarity.)
     */
    directory_entry(const directory_entry&) = delete;
    directory_entry& operator=(const directory_entry&) = delete;

    /**
     * Returns size of the directory_entry.
     * This is used to find the cache space taken by this directory_entry.
     */
    size_t get_cache_size() const
    {
        /*
         * Since we store this directory_entry in a map, it will have two
         * pointers and a key and value, all 8 bytes each, so we add those
         * to get a closer estimate.
         *
         * Note: For usual filename lengths it comes to ~250 bytes.
         * Note: It may take slightly more than this.
         */
        const size_t map_node_overhead = 4 * sizeof(uint64_t);

        return sizeof(*this) + ::strlen(name) + map_node_overhead;
    }

    /**
     * Return size of fuse buffer required to hold this directory_entry.
     * If readdirplus is true, the size returned is for containing the
     * entry along with the attributes, else it's w/o the attributes.
     *
     * Note: \p readdirplus is ignored when built with ENABLE_NO_FUSE.
     */
    size_t get_fuse_buf_size([[maybe_unused]] bool readdirplus) const
    {
#ifndef ENABLE_NO_FUSE
        /*
         * The fuse helpers are called with a null buffer and 0 bufsize, so
         * they only compute the space needed for an entry with this name.
         */
        if (readdirplus) {
            return fuse_add_direntry_plus(
                        nullptr, nullptr, 0, name, nullptr, 0);
        } else {
            return fuse_add_direntry(
                        nullptr, nullptr, 0, name, nullptr, 0);
        }
#else
        /*
         * In nofuse mode we just add dirent objects to user buffer.
         */
        return sizeof(struct dirent) + ::strlen(name);
#endif
    }

    /**
     * Returns true if \p name is exactly "." or "..".
     * A null \p name returns false.
     */
    static bool is_dot_or_dotdot(const char *name)
    {
        return (name != nullptr) &&
               ((name[0] == '.') &&
                ((name[1] == '\0') ||
                 ((name[1] == '.') && (name[2] == '\0'))));
    }

    // Returns true if this entry is the "." or ".." entry.
    bool is_dot_or_dotdot() const
    {
        return is_dot_or_dotdot(name);
    }
};
/**
* This is our unified readdir and DNLC cache.
*/
struct readdirectory_cache
{
private:
/*
* The singleton nfs_client, for convenience.
*/
struct nfs_client *const client;
/*
* Directory inode, whose contents are cached by this readdirectory_cache.
*/
struct nfs_inode *const dir_inode;
/*
* This will be set if we have read all the entries of the directory
* from the backend.
*/
bool eof;
/*
* last cookie.
* Only valid if eof is true.
*/
uint64_t eof_cookie = (uint64_t) -1;
/*
* Last cookie of the sequence that started at the start of the directory.
* If the sequence goes all the way upto eof w/o any gaps in between then
* we can mark the directory as "confirmed", in set_eof().
* It means that we have the entire directory in our cache and hence DNLC
* cache can reply to negative lookup with certainty.
* This is reset when we purge the cache.
*/
uint64_t seq_last_cookie = 0;
// Size of the cache.
size_t cache_size;
cookieverf3 cookie_verifier;
/*
* This readdirectory_cache can be used only for serving lookup queries,
* those made through dnlc_lookup(). It MUST NOT be used for serving
* readdir/readdirplus queries, made through lookup().
* readdirectory_cache marked lookuponly is not exactly in sync with the
* directory contents (one or more file/dir has been created/deleted) since
* the directory contents were last enumerated and cached, though it can
* be used to serve dnlc_lookup() requests which query a specific filename
* and do not need the cookie to be correctly set.
*
* Note: If lookuponly is set, readdirectory_cache must be purged before
* serving any lookup() request.
*
* Q: When is a readdirectory_cache marked lookuponly?
* A: This is an optimization to allow dnlc_lookup() operation on a
* readdirectory_cache after one or more file/dir is created or deleted
* inside the directory. Note that when a file/dir is created inside a
* directory we cannot keep using the readdirectory_cache for serving
* directory enumeration queries as we cannot add a newly created file
* or dir to the readdirectory_cache, since we need to add the cookie
* as well. Instead of purging the readdirectory_cache on creation
* or removal of a file/dir we instead mark it as lookuponly so that
* we can continue to serve dnlc_lookup() queries. When lookup() is
* called we then purge the readdirectory_cache just before enumerating
* the directory.
*
* Q: Why can't we remove a file/dir from the readdirectory_cache and
* continue to use the readdirectory_cache for serving lookup() requests?
* A: Yes, unlike the create case, for delete case we can safely delete
* a readdirectory_cache entry, but we still cannot do it due to the
* way Blob NFS readdir cache works.
* Since we have the directory entries cached, when fuse calls readdir
* again we serve from the cache till we reach the deleted entry's cookie
* which we don't find in the readdir cache. This causes us to make a
* READDIR{PLUS} call to the server with the given cookie and the stored
* cookieverifier. This will cause the server to return the deleted entry
* also as it'll be in the readdir cache for that cookieverifier.
* This is not correct, hence we mark the readdirectory_cache as
* lookuponly even when a file/dir is deleted.
*/
std::atomic<bool> lookuponly = false;
/*
* Absolute time in msecs since epoch when this directory cache was last
* confirmed. A directory is said to be "confirmed" when we know that we
* have the full directory cached and hence we can respond to -ve lookup
* requests with confidence. By definition every newly created directory
* starts as confirmed.
*/
std::atomic<uint64_t> confirmed_msecs = 0;
/*
* dir_entries is the readdir cache, indexed by cookie value.
* We double readdir cache as DNLC cache too. dnlc_map is used to convert
* filename (which is the index into the DNLC cache) to cookie (which is
* the index into the readdir cache).
* dir_entries contains shared_ptr of directory_entry objects, thus
* lookup_dircache() can safely return a vector of directory_entry objects
* w/o worrying of them being deleted by an unlink() or some other call
* right after.
* Original ref to the shared_ptr is held when directory_entry is added to
* dir_entries by readdirectory_cache::add().
*/
std::map<cookie3, std::shared_ptr<struct directory_entry>> dir_entries;
std::unordered_map<std::string, cookie3> dnlc_map;
/*
* This lock protects all the members of this readdirectory_cache.
*/
mutable std::shared_mutex readdircache_lock_2;
/*
* Flag to quickly mark the cache as invalid w/o purging the entire
* cache. Once invalidate_pending is set, next cache lookup will first
* purge the cache.
*/
std::atomic<bool> invalidate_pending = false;
public:
readdirectory_cache(struct nfs_client *_client,
struct nfs_inode *_inode):
client(_client),
dir_inode(_inode),
eof(false),
cache_size(0)
{
assert(client);
assert(dir_inode);
assert(dir_entries.empty());
// Initial cookie_verifier must be 0.
::memset(&cookie_verifier, 0, sizeof(cookie_verifier));
readdirectory_cache::num_caches++;
}
~readdirectory_cache();
/**
* Call this to check if the cache is empty.
*/
bool is_empty() const
{
return dir_entries.empty();
}
// This is helpul for asserting.
size_t get_num_entries() const
{
return dir_entries.size();
}
/*
* Return true and populates the \p dirent if the entry corresponding
* to \p cookie exists.
* Returns false otherwise.
*
* Note: The returned directory_entry shared_ptr holds an extra ref, so
* caller can safely use it even if the original directory_entry
* stored in dir_entries is deleted.
*/
bool get_entry_at(cookie3 cookie, std::shared_ptr<directory_entry>& dirent)
{
// Take shared lock on the map.
std::shared_lock<std::shared_mutex> lock(readdircache_lock_2);
auto it = dir_entries.find(cookie);
if (it != dir_entries.end())
{
dirent = it->second;
return true;
}
return false;
}
/**
* Accessor methods for lookuponly.
*/
void set_lookuponly();
void clear_lookuponly();
bool is_lookuponly() const;
/**
* Set this directory cache as "confirmed".
*/
void set_confirmed();
void clear_confirmed();
/**
* Is this directory cache confirmed?
* If readdirectory_cache lookup returns no entry and is_confirmed()
* returns true, then we can return a negative lookup response to fuse.
* Depending on the config we may only consider if the directory was
* confirmed no longer than a certain period.
*/
bool is_confirmed() const;
/**
* add() will add entry to dir_entries after bumping the shared_ptr ref.
* When called from readdir_callback()/readdirplus_callback(), cookieverf
* received is also passed. add() then atomically addds the directory_entry
* as well as updates the cookieverf stored in the readdirectory_cache.
* It's important to atomically update cookie and cookieverf in the
* readdirectory_cache since anyone looking up the cache and finding a
* cookie may use the cookieverf to query subsequent entries from the
* server.
*/
bool add(const std::shared_ptr<struct directory_entry>& entry,
const cookieverf3 *cookieverf,
bool acquire_lock = true);
void dnlc_add(const char *filename, struct nfs_inode *inode);
/*
* TODO: Access to this must be synchronized.
*/
const cookieverf3* get_cookieverf() const
{
return &cookie_verifier;
}
bool get_eof() const
{
return eof;
}
uint64_t get_eof_cookie() const
{
return eof_cookie;
}
uint64_t get_seq_last_cookie() const
{
return seq_last_cookie;
}
void set_cookieverf_nolock(const cookieverf3 *cookieverf)
{
assert(cookieverf != nullptr);
#ifndef ENABLE_NON_AZURE_NFS
/*
* We store the cookieverf returned by the server in response to
* READDIR{PLUS} RPC. Blob NFS server should not return 0 as cookieverf.
*/
assert(cv2i(*cookieverf) != 0);
#endif
::memcpy(&cookie_verifier, cookieverf, sizeof(cookie_verifier));
}
void set_cookieverf(const cookieverf3 *cookieverf)
{
assert(cookieverf != nullptr);
std::unique_lock<std::shared_mutex> lock(readdircache_lock_2);
set_cookieverf_nolock(cookieverf);
}
void set_eof(uint64_t eof_cookie);
/**
* Given a filename, returns the cookie corresponding to that.
* The cookie returned is the one returned for this filename, by the latest
* READDIR/READDIRPLUS response.
* A return value of 0 means the file was not found in the cache.
*
* Note: Caller MUST hold readdircache_lock_2.
*/
cookie3 filename_to_cookie(const char *filename) const
{
const auto it = dnlc_map.find(filename);
const cookie3 cookie = (it == dnlc_map.end()) ? 0 : it->second;
#ifndef ENABLE_NON_AZURE_NFS
/*
* Blob NFS uses 1:1 mapping between cookie and files, so the
* cookie value issued by Blob NFS can be safely assumed to be less
* than UINT32_MAX, as we won't have 4B entries in a directory.
* For entries added by lookup, readdirectory_cache::dnlc_add() uses
* cookie value counting up from UINT64_MAX>>1.
*/
assert((cookie < UINT32_MAX) || (cookie >= (UINT64_MAX >> 1)));
#endif
return cookie;
}
/**
* Lookup and return the directory_entry corresponding to the
* given cookie.
* lookup() is the readdir cache lookup method, while dnlc_lookup() is
* the DNLC cache lookup method.
*
* Note: lookup() returns after holding a dircachecnt ref on the inode,
* while dnlc_lookup() holds a lookupcnt ref on the inode.
* Caller must drop this extra ref held.
*
* Note: lookup() returns the directory_entry shared_ptr with an extra
* ref held so that the returned directory_entry is safe to use
* even if the corresponding actual directory_entry in dir_entries
* is deleted after lookup() returns.
*/
std::shared_ptr<struct directory_entry> lookup(
cookie3 cookie,
const char *filename_hint = nullptr,
bool acquire_lock = true) const;
struct nfs_inode *dnlc_lookup(const char *filename,
bool *negative_confirmed = nullptr) const;
/**
* Remove the given cookie from readdirectory_cache.
* Returns false if the cookie was not found, else it delete the cookie
* and returns true. It also deletes the inode if this was the last ref
* on the inode.
* remove() is the readdir cache delete method, while dnlc_remove() is
* the DNLC cache delete method.
*/
bool remove(cookie3 cookie,
const char *filename_hint = nullptr,
bool acquire_lock = true);
bool dnlc_remove(const char *filename)
{
assert(filename != nullptr);
return remove(0, filename);
}
/**
* Remove all entries from the cache.
* Also delete the inodes for those entries for which this was the last
* ref.
*/
void clear(bool acquire_lock = true);
void invalidate()
{
invalidate_pending = true;
}
/**
* clear directory cache if pending. Directory cache is cleared in the
* following cases
* 1. readdirectory_cache is marked lookuponly.
* 2. readdirectory_cache has invalidate_pending set.
*/
void clear_if_needed();
/*
* Global stats for all caches.
*/
static std::atomic<uint64_t> num_caches;
// Cum dirents caches across all readdir caches.
static std::atomic<uint64_t> num_dirents_g;
// Readdir calls made by fuse.
static std::atomic<uint64_t> num_readdir_calls_g;
// Readdirplus calls made by fuse.
static std::atomic<uint64_t> num_readdirplus_calls_g;
// Cum dirents returned to fuse.
static std::atomic<uint64_t> num_dirents_returned_g;
// Total bytes consumed by all readdir caches.
static std::atomic<uint64_t> bytes_allocated_g;
};
#endif /* __READDIR_RPC_TASK__ */