turbonfs/inc/nfs_client.h (287 lines of code) (raw):
#ifndef __NFS_CLIENT_H__
#define __NFS_CLIENT_H__
#include <queue>
#include "nfs_inode.h"
#include "rpc_transport.h"
#include "nfs_internal.h"
/**
* This is an informal lock registry for all locks used in the aznfsclient code.
* Any new lock introduced should be added here and it must pick a unique number
* N for its name which is of the form <context>_lock_N. N is the order of the
* lock. A thread can only acquire a higher order lock (greater N) than the
* highest order lock it's currently holding, i.e., a thread holding a lock
* *_lock_N cannot acquire any lock from *_lock_0 to *_lock_N-1 (it can only
* acquire *_lock_N+1 and higher order locks).
* - nfs_client::inode_map_lock_0
* - nfs_inode::ilock_1
* - nfs_inode::readdircache_lock_2
* - nfs_inode::iflush_lock_3
* - nfs_client::jukebox_seeds_lock_39
* +++++++++++++++++++++++++++++++++
* - ra_state::ra_lock_40
* - rpc_task_helper::task_index_lock_41
* - rpc_stats_az::stats_lock_42
* - bytes_chunk_cache::chunkmap_lock_43
* - membuf::mb_lock_44
* - membuf::flush_waiters_lock_44
*/
/*
 * Prototype for nfs_get_rootfh(), declared with C linkage.
 * libnfs exports this symbol but none of its public headers declare it, so
 * we declare it ourselves.
 *
 * TODO: Update libnfs to export this and remove from here.
 */
extern "C" const struct nfs_fh3* nfs_get_rootfh(struct nfs_context* nfs);
/**
* This represents the NFS client. Since we have only one NFS client at a time,
* this is a singleton class.
* Caller can make NFSv3 API calls by calling corresponding methods from this
* class. Those methods will then call into libnfs to make the actual NFS RPC
* calls.
* User should first init the class by calling init() by specifying all the
* parameters needed to mount the filesystem.
* Once initialized, callers can get the singleton instance of this class by
* calling the get_instance() static method.
* The returned instance can then be used to call the APIs like getattr, write etc.
*/
/**
 * Magic value identifying an nfs_client object. It is the leading 4 bytes of
 * the string literal "NFSC" read as a uint32_t. It is stored in
 * nfs_client::magic and compared in asserts to validate nfs_client pointers.
 */
#define NFS_CLIENT_MAGIC *((const uint32_t *)"NFSC")
/**
* RPC requests that fail with JUKEBOX error are retried after these many secs.
* We try after 5 seconds similar to Linux NFS client.
*/
#define JUKEBOX_DELAY_SECS 5
struct nfs_client
{
    /*
     * Always NFS_CLIENT_MAGIC, used in asserts to validate nfs_client
     * pointers.
     */
    const uint32_t magic = NFS_CLIENT_MAGIC;

private:
    /*
     * This is the RPC transport connected to the NFS server.
     * RPC transport is made up of one or more nfs_connection which are used
     * to carry the RPC requests/responses.
     */
    struct rpc_transport transport;

    /*
     * Root File Handle obtained after mounting the filesystem.
     * This will be set after calling nfs_mount which is done in the init()
     * method.
     */
    struct nfs_inode *root_fh = nullptr;

    /*
     * Map of all inodes returned to fuse and which are not FORGET'ed
     * by fuse. The idea behind this map is to make sure we never return
     * two different fuse_ino_t inode number for the same file, lest it'll
     * confuse the VFS layer. This is achieved by adding any inode we
     * return to fuse, to this map.
     * An inode will be removed from the map only when all the following
     * conditions are met:
     * 1. inode->lookupcnt becomes 0.
     *    This confirms that fuse vfs does not have this inode and hence
     *    it cannot make any call on this inode.
     * 2. inode->dircachecnt becomes 0.
     *    Whenever we cache directory_entry for readdirplus, the
     *    directory_entry also refers to the inode and hence we need to
     *    make sure that the inode is not freed till any directory_entry
     *    is referring to it.
     */
    std::multimap<uint64_t /* fileid */, struct nfs_inode*> inode_map;
    mutable std::shared_mutex inode_map_lock_0;

    /*
     * Every RPC request is represented by an rpc_task which is created when
     * the fuse request is received and remains till the NFS server sends a
     * response. rpc_task_helper class allows efficient allocation of RPC
     * tasks.
     */
    class rpc_task_helper *rpc_task_helper = nullptr;

    /*
     * JUKEBOX errors are handled by re-running the nfs_client handler for the
     * given request, f.e., for a READDIRPLUS request failing with JUKEBOX error
     * we will call nfs_client::readdirplus() again after JUKEBOX_DELAY_SECS
     * seconds. For this we need to save enough information needed to run the
     * nfs_client handler. jukebox_seedinfo stores that information and we
     * queue that in jukebox_seeds.
     */
    std::thread jukebox_thread;
    void jukebox_runner();
    std::queue<struct jukebox_seedinfo*> jukebox_seeds;
    mutable std::mutex jukebox_seeds_lock_39;

    /*
     * Holds info about the server, queried by FSINFO.
     */
    struct nfs_server_info server_info;

    /*
     * Holds info about the server, queried by FSSTAT.
     */
    struct nfs_server_stat server_stat;

#ifdef ENABLE_PARANOID
    /*
     * Since we use the address of nfs_inode as the inode number we
     * return to fuse, this is a small sanity check we do to check if
     * fuse is passing us valid inode numbers.
     */
    std::atomic<uint64_t> min_ino = UINT64_MAX;
    std::atomic<uint64_t> max_ino = 0;
#endif

    /*
     * Last 5 sec read and write throughput.
     * rw_genid is updated every time these values are updated, so can be used
     * to check when throughput is updated.
     */
    std::atomic<uint64_t> r_MBps = 0;
    std::atomic<uint64_t> w_MBps = 0;
    std::atomic<uint64_t> rw_genid = 0;

    /*
     * Value returned by max_dirty_extent_bytes() is scaled down by this much
     * before it's used by:
     * - flush_required()
     * - commit_required()
     * - do_inline_write()
     *
     * fc_scale_factor is computed by periodic_updater() according to the global
     * cache pressure. If global cache pressure is high we want the local
     * flush/commit limits to be reduced so that each file flushes/commits
     * faster thus easing the global cache pressure. This promotes fair sharing
     * of global cache space while also maintaining enough contiguous data to
     * the server, needed for better write throughput. Stable and unstable
     * write may use this scale factor differently.
     */
    static std::atomic<double> fc_scale_factor;

    /*
     * periodic_updater() will update this scaling factor to force all ra_state
     * machines to slow down readahead in case of high memory pressure.
     */
    static std::atomic<double> ra_scale_factor;

    /*
     * Set in shutdown() to let others know that nfs_client is shutting
     * down. They can use this to quit what they are doing and plan for
     * graceful shutdown.
     */
    std::atomic<bool> shutting_down = false;

    /*
     * Private constructor, the only instance is the one created by
     * get_instance(). The transport is handed a pointer to this client.
     */
    nfs_client() :
        transport(this)
    {
    }

    ~nfs_client()
    {
        AZLogInfo("~nfs_client() called");

        /*
         * shutdown() should have cleared the root_fh.
         */
        assert(root_fh == nullptr);
    }

    /**
     * Internal method used by __get_nfs_inode() for querying nfs_inode from
     * inode_map. It returns nfs_inode after holding a lookupcnt ref so caller
     * can safely use that w/o worrying about the nfs_inode being removed from
     * inode_map.
     */
    struct nfs_inode *__inode_from_inode_map(const nfs_fh3 *fh,
                                             const struct fattr3 *fattr,
                                             bool acquire_lock = true,
                                             bool *is_forgotten = nullptr);

public:
    /*
     * Mount options (to be) used for mounting. These contain details of the
     * server and share that's mounted and also the mount options used.
     */
    struct mount_options mnt_options;

    /*
     * Return the instance of the singleton class.
     */
    static nfs_client& get_instance()
    {
        static nfs_client client;
        return client;
    }

    /**
     * Returns the current flush/commit scale factor (see fc_scale_factor).
     * The atomic is read exactly once so that the value validated by the
     * assert is the same value handed to the caller, even if
     * periodic_updater() stores a new value concurrently.
     */
    static double get_fc_scale_factor()
    {
        const double scale = fc_scale_factor.load();
        assert(scale >= 1.0/10);
        return scale;
    }

    /**
     * Returns the current readahead scale factor (see ra_scale_factor).
     * The atomic is read exactly once so that the value validated by the
     * assert is the same value handed to the caller.
     */
    static double get_ra_scale_factor()
    {
        const double scale = ra_scale_factor.load();
        assert(scale >= 0);
        return scale;
    }

    /**
     * Returns true if nfs_client is shutting down.
     */
    bool is_shutting_down() const
    {
        return shutting_down;
    }

    /**
     * Must be called on fuse unmount.
     * TODO: Audit this to make sure we perform cleanup for all components.
     */
    void shutdown();

    /**
     * Returns the RPC transport connected to the NFS server.
     */
    const struct rpc_transport& get_transport() const
    {
        return transport;
    }

    /**
     * Returns the rpc_task_helper used for allocating RPC tasks.
     */
    class rpc_task_helper *get_rpc_task_helper()
    {
        return rpc_task_helper;
    }

    /**
     * Returns inode_map_lock_0, the lock protecting inode_map.
     */
    std::shared_mutex& get_inode_map_lock()
    {
        return inode_map_lock_0;
    }

    /**
     * Update various stuff that needs to be periodically updated, like:
     * - Last 5 sec read and write throughput.
     * - Readahead scale factor for controlling readahead amount, and
     * - Flush/commit dirty data scale factor for controlling how long we keep
     *   dirty data before flushing/committing.
     *
     * Call this from some place that's called very frequently.
     */
    void periodic_updater();

    /**
     * Get last 5 sec read throughput in MBps.
     */
    uint64_t get_read_MBps() const
    {
        return r_MBps;
    }

    /**
     * Get last 5 sec write throughput in MBps.
     */
    uint64_t get_write_MBps() const
    {
        return w_MBps;
    }

    /**
     * Get the generation id that is updated every time the read/write
     * throughput values are updated.
     */
    uint64_t get_rw_genid() const
    {
        return rw_genid;
    }

    /*
     * The user should first init the client class before using it.
     */
    bool init();

    /*
     * Get the libnfs context on which the libnfs API calls can be made.
     *
     * csched:  The connection scheduling type to be used when selecting the
     *          NFS context/connection.
     * fh_hash: Filehandle hash, used only when CONN_SCHED_FH_HASH scheduling
     *          mode is used. This provides a unique hash for the file/dir
     *          that is the target for this request. All requests to the same
     *          file/dir are sent over the same connection.
     */
    struct nfs_context* get_nfs_context(conn_sched_t csched,
                                        uint32_t fh_hash) const;

    /*
     * Given an inode number, return the nfs_inode structure.
     * For efficient access we use the address of the nfs_inode structure as
     * the inode number. Fuse should always pass inode numbers we return in
     * one of the create APIs, so it should be ok to trust fuse.
     * Once Fuse calls the forget() API for an inode, it won't pass that
     * inode number in any future request, so we can safely destroy the
     * nfs_inode on forget.
     */
    struct nfs_inode *get_nfs_inode_from_ino(fuse_ino_t ino)
    {
        // 0 is not a valid inode number.
        assert(ino != 0);

        if (ino == FUSE_ROOT_ID) {
            // root_fh must have been created by now.
            assert(root_fh != nullptr);
            assert(root_fh->magic == NFS_INODE_MAGIC);
            return root_fh;
        }

#ifdef ENABLE_PARANOID
        assert(ino >= min_ino);
        assert(ino <= max_ino);
#endif

        struct nfs_inode *const nfsi =
            reinterpret_cast<struct nfs_inode *>(ino);

        // Dangerous cast, deserves validation.
        assert(nfsi->magic == NFS_INODE_MAGIC);

        return nfsi;
    }

    /**
     * Given a filehandle and fattr (containing fileid defining a file/dir),
     * get the nfs_inode for that file/dir. It searches in the global list of
     * all inodes and returns from there if found, else creates a new nfs_inode.
     * Note that we don't want to return multiple fuse inodes for the same
     * file (represented by the filehandle). If fuse guarantees that it'll
     * never make a lookup or any other call that gets a new inode, until
     * it calls forget for that inode, then we can probably use different
     * inodes for the same file but not at the same time. Since fuse doesn't
     * guarantee that, we play safe and make sure for a given file we use the
     * same nfs_inode as long as one is cached with us. New incarnation of
     * fuse driver will give a different fuse ino for the same file, but
     * that should be ok.
     * It'll grab a refcnt on the inode before returning. Caller must ensure
     * that the ref is duly dropped at an appropriate time. Most commonly
     * this refcnt held by get_nfs_inode() is transferred to fuse and is
     * dropped when fuse FORGETs the inode.
     * 'is_root_inode' must be set when the inode being requested is the
     * root inode. Root inode is special in that it has the special fuse inode
     * number of 1, rest other inodes have inode number as the address of
     * the nfs_inode structure, which allows fast ino->inode mapping.
     */
    struct nfs_inode *__get_nfs_inode(LOC_PARAMS
                                      const nfs_fh3 *fh,
                                      const struct fattr3 *fattr,
                                      bool is_root_inode = false);

#define get_nfs_inode(fh, fattr, ...) \
    __get_nfs_inode(LOC_VAL fh, fattr, ## __VA_ARGS__)

    /**
     * Get various stats related to inodes/files.
     */
    void get_inode_stats(uint64_t& total_inodes,
                         uint64_t& num_files,
                         uint64_t& num_dirs,
                         uint64_t& num_symlinks,
                         uint64_t& open_files,
                         uint64_t& open_dirs,
                         uint64_t& num_files_cache_empty,
                         uint64_t& num_dirs_cache_empty,
                         uint64_t& num_forgotten,
                         uint64_t& expecting_forget,
                         uint64_t& num_dircached,
                         uint64_t& num_silly_renamed) const;

    /**
     * Release the given inode, called when fuse FORGET call causes the
     * inode lookupcnt to drop to 0, i.e., the inode is no longer in use
     * by fuse VFS. Note that it takes a dropcnt parameter which is the
     * nlookup parameter passed by fuse FORGET. Instead of the caller
     * reducing lookupcnt and then calling put_nfs_inode(), the caller
     * passes the amount by which the lookupcnt must be dropped. This is
     * important as we need to drop the lookupcnt inside inode_map_lock_0,
     * else if we drop before the lock and lookupcnt becomes 0, some other
     * thread can delete the inode while we still don't have the lock, and
     * then when we proceed to delete the inode, we would be accessing the
     * already deleted inode.
     *
     * If the inode lookupcnt (after reducing by dropcnt), becomes 0 and it's
     * not referenced by any readdirectory_cache (inode->dircachecnt is 0)
     * then the inode is removed from the inode_map and freed.
     *
     * This nolock version does not hold inode_map_lock_0 so the caller
     * must hold the lock before calling this. Usually you will call one of
     * the other variants which hold the lock.
     *
     * Note: Call put_nfs_inode()/put_nfs_inode_nolock() only when you are
     *       sure dropping dropcnt refs will cause the lookupcnt to become 0.
     *       It's possible that before put_nfs_inode() acquires
     *       inode_map_lock_0, someone may grab a fresh ref on the inode, but
     *       that's fine as put_nfs_inode_nolock() handles that. Since it
     *       expects caller to only call it when the inode lookupcnt is going
     *       to be 0, it logs a "Inode no longer forgotten..." warning log in
     *       that case.
     */
    void put_nfs_inode_nolock(struct nfs_inode *inode, size_t dropcnt);

    /**
     * Locking variant of put_nfs_inode_nolock(): takes inode_map_lock_0
     * exclusively and then calls put_nfs_inode_nolock().
     */
    void put_nfs_inode(struct nfs_inode *inode, size_t dropcnt)
    {
        /*
         * We need to hold inode_map_lock_0 while we check the inode for
         * eligibility to remove (and finally remove) from the inode_map.
         */
        std::unique_lock<std::shared_mutex> lock(inode_map_lock_0);
        put_nfs_inode_nolock(inode, dropcnt);
    }

    /*
     *
     * Define NFSv3 API specific functions and helpers after this point.
     *
     * TODO: Add more NFS APIs as we implement them.
     */
    void getattr(
        fuse_req_t req,
        fuse_ino_t ino,
        struct fuse_file_info* file);

    /**
     * Issue a sync GETATTR RPC call to filehandle 'fh' and save the received
     * attributes in 'attr'.
     * This is to be used internally and not for serving fuse requests.
     */
    bool getattr_sync(const struct nfs_fh3& fh,
                      fuse_ino_t ino,
                      struct fattr3& attr);

    void statfs(fuse_req_t req, fuse_ino_t ino);

    void create(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        mode_t mode,
        struct fuse_file_info* file);

    void mknod(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        mode_t mode);

    void mkdir(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        mode_t mode);

    /**
     * Try to perform silly rename of the given file (parent_ino/name) and
     * return true if silly rename was required (and done), else return false.
     * Note that silly rename is required for the following two cases:
     *
     * 1. When unlinking a file we need to silly rename the file if it has a
     *    non-zero open count.
     *    In this case caller just needs to pass parent_ino and name.
     *    In this case (silly) renaming the to-be-unlinked file is sufficient
     *    in order to serve the unlink requested by the user.
     * 2. When renaming oldparent_ino/old_name to parent_ino/name, after the
     *    rename parent_ino/name will start referring to the file originally
     *    referred by oldparent_ino/old_name and in case parent_ino/name existed
     *    at the time of rename that file would no longer be accessible after
     *    rename, so it's effectively deleted by the server. Hence we need to
     *    silly rename it if it has a non-zero open count.
     *    In this case caller needs to pass parent_ino and name and additionally
     *    oldparent_ino and old_name. The oldparent_ino and old_name are as such
     *    not used by silly rename but since the actual rename is performed when
     *    the silly rename succeeds (from rename_callback()), we need to store
     *    the oldparent_ino and old_name details in the silly rename task.
     *    In this case silly_rename() will do the following:
     *    - silly rename the outgoing file, and if/when silly rename succeeds,
     *      perform actual rename (oldparent_ino/old_name -> parent_ino/name).
     */
    bool silly_rename(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        fuse_ino_t oldparent_ino = 0,
        const char *old_name = nullptr);

    /**
     * for_silly_rename tells if this unlink() call is being made to delete
     * a silly-renamed file (.nfs_*), as a result of a release() call from
     * fuse that drops the final opencnt on the file. Note that an earlier
     * unlink of the file would have caused the file to be (silly)renamed to
     * the .nfs_* name and now when the last opencnt is dropped we need to
     * delete the .nfs_* file. Since we hold the parent directory inode refcnt
     * in rename_callback() for silly renamed files, we need to drop the refcnt
     * now.
     */
    void unlink(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        bool for_silly_rename);

    void rmdir(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char* name);

    void symlink(
        fuse_req_t req,
        const char *link,
        fuse_ino_t parent_ino,
        const char *name);

    /**
     * silly_rename must be passed as true if this is a silly rename and not
     * rename triggered by user. See silly_rename() for explanation of why and
     * when we need to silly rename a file. If this rename operation is
     * being performed to realize a silly rename, then silly_rename_ino must
     * contain the ino of the file that's being silly renamed.
     * Also in that case oldparent_ino and old_name refer to the source of the
     * actual rename triggered by user.
     *
     * See comments above init_rename() in rpc_task.h.
     */
    void rename(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char *name,
        fuse_ino_t newparent_ino,
        const char *new_name,
        bool silly_rename = false,
        fuse_ino_t silly_rename_ino = 0,
        fuse_ino_t oldparent_ino = 0,
        const char *old_name = nullptr);

    void readlink(
        fuse_req_t req,
        fuse_ino_t ino);

    void setattr(
        fuse_req_t req,
        fuse_ino_t ino,
        const struct stat* attr,
        int to_set,
        struct fuse_file_info* file);

    void lookup(
        fuse_req_t req,
        fuse_ino_t parent_ino,
        const char* name);

    /**
     * Sync version of lookup().
     * This is to be used internally and not for serving fuse requests.
     * It returns 0 if we are able to get a success response for the
     * LOOKUP RPC that we sent, in that case child_ino will contain the
     * child's fuse inode number.
     * In case of a failed lookup it'll return a +ve errno value.
     */
    int lookup_sync(
        fuse_ino_t parent_ino,
        const char *name,
        fuse_ino_t& child_ino);

    void access(
        fuse_req_t req,
        fuse_ino_t ino,
        int mask);

    void write(
        fuse_req_t req,
        fuse_ino_t ino,
        struct fuse_bufvec *bufv,
        size_t size,
        off_t off);

    void flush(
        fuse_req_t req,
        fuse_ino_t ino);

    void readdir(
        fuse_req_t req,
        fuse_ino_t ino,
        size_t size,
        off_t off,
        struct fuse_file_info* file);

    void readdirplus(
        fuse_req_t req,
        fuse_ino_t ino,
        size_t size,
        off_t off,
        struct fuse_file_info* file);

    void read(
        fuse_req_t req,
        fuse_ino_t ino,
        size_t size,
        off_t off,
        struct fuse_file_info *fi);

    void jukebox_read(struct api_task_info *rpc_api);
    void jukebox_write(struct api_task_info *rpc_api);
    void jukebox_flush(struct api_task_info *rpc_api);

    /**
     * Convert between NFS fattr3 and POSIX struct stat.
     */
    static void stat_from_fattr3(struct stat& st, const struct fattr3& fattr);
    static void fattr3_from_stat(struct fattr3& fattr, const struct stat& st);

    void reply_entry(
        struct rpc_task* ctx,
        const nfs_fh3* fh,
        const struct fattr3* attr,
        const struct fuse_file_info* file);

    /**
     * Call this to handle NFS3ERR_JUKEBOX error received for rpc_task.
     * This will save information needed to re-issue the call and queue
     * it in jukebox_seeds from where jukebox_runner will issue the call
     * after JUKEBOX_DELAY_SECS seconds.
     */
    void jukebox_retry(struct rpc_task *task);
};
/**
 * Magic value stored in sync_rpc_context::magic: the leading 4 bytes of the
 * string literal "SRCX" read as a uint32_t.
 */
#define SYNC_RPC_CTX_MAGIC *((const uint32_t *)"SRCX")

/**
 * Context structure through which the issuer of a sync RPC call and the
 * callback of that call communicate with each other.
 */
struct sync_rpc_context
{
    const uint32_t magic{SYNC_RPC_CTX_MAGIC};

    /*
     * The callback sets this to convey that it has indeed been called.
     * The issuer can check it to tell whether it timed out waiting for the
     * callback.
     */
    bool callback_called{false};

    /*
     * RPC and NFS status. These are meaningful only once callback_called is
     * true, and nfs_status additionally requires rpc_status to be
     * RPC_STATUS_SUCCESS.
     */
    int rpc_status{-1};
    int nfs_status{-1};

    /*
     * Condition variable (and its mutex) on which the issuer waits for the
     * callback to be called.
     */
    std::condition_variable cv;
    std::mutex mutex;

    /*
     * The rpc_task tracking the actual RPC call.
     */
    struct rpc_task *const task;

    /*
     * Most NFS RPCs carry postop attributes. When this is non-null the
     * callback fills it with the postop attributes received.
     */
    struct fattr3 *const fattr{nullptr};

    sync_rpc_context(struct rpc_task *rpc, struct fattr3 *postop_attr) :
        task{rpc},
        fattr{postop_attr}
    {
    }
};
/**
 * Returns the nfs_client for the given fuse request. As nfs_client is a
 * singleton this is always the singleton instance; 'req' is used only for
 * sanity asserts:
 * - In nofuse mode (ENABLE_NO_FUSE) req must be a pointer to posix_task.
 * - Otherwise, with ENABLE_PARANOID, the nfs_client pointer stored in the
 *   fuse req private pointer must match the singleton.
 */
static inline
struct nfs_client *get_nfs_client_from_fuse_req(
        [[maybe_unused]] const fuse_req_t req = nullptr)
{
    struct nfs_client& instance = nfs_client::get_instance();

#if defined(ENABLE_NO_FUSE)
    /*
     * In nofuse mode req must be a pointer to posix_task.
     */
    assert(_FR2PXT(req)->magic == POSIX_TASK_MAGIC);
#elif defined(ENABLE_PARANOID)
    assert(&instance ==
           reinterpret_cast<struct nfs_client*>(fuse_req_userdata(req)));
#endif

    // Make sure what we are about to return really is an nfs_client.
    assert(instance.magic == NFS_CLIENT_MAGIC);

    return &instance;
}
#endif /* __NFS_CLIENT_H__ */