4.File.Cache/nfs.py (72 lines of code) (raw):

#!/usr/bin/python3 import gc import time import subprocess from prometheus_client import Gauge, Counter, start_http_server current_cookies = Gauge("current_cookies", "Current number of cookies (inodes) in cache") current_volumes = Gauge("current_volumes", "Current number of volume cookies (volumes) in cache") current_vol_col = Counter("current_vol_col", "Number of volume index key collisions in cache") current_vol_oom = Counter("current_vol_oom", "Number of out of memory events when allocating volume cookies") acquire_cookies = Counter("acquire_cookies", "Current number of cookies acquired for files") acquire_successful = Counter("acquire_successful", "Number of successful attempts to acquire cookies for files") acquire_oom = Counter("aquire_oom", "Number of attempts to acquire cookies that failed due to out of memory") lru_current = Gauge("lru_current", "Current number of cookies in the LRU cache") lru_expired = Counter("lru_expired", "Number of cookies that have been processed (expired) in the LRU cache") lru_removed = Counter("lru_removed", "Number of cookies that have been removed from the LRU cache") lru_dropped = Counter("lru_dropped", "Number of cookies that have been relinquished or withdrawn from the LRU cache") lru_cull = Gauge("lru_cull", "Time (in jiffies) until the next culling (processing) of the LRU cache") inval_cookies = Counter("inval_cookies", "Number of cookies invalidated (removed) from the cache") update_cookies = Counter("update_cookies", "Number of update cookies sent to cache") resize_requests = Counter("resize_requests", "Number of resize requests") resize_skips = Counter("resize_skips", "Number of skipped resize requests") relinquish_cookies = Counter("relinquish_cookies", "Number of relinquish cookie requests") relinquish_retires = Counter("relinquish_retires", "Number of relinquish requests where retire=true") relinquish_drops = Counter("relinquish_drops", "Number of cookies no longer blocking reacquisition") nospace_writes = Counter("nospace_writes", "Number of failed cache writes due to no space in cache") nospace_creates = Counter("nospace_creates", "Number of failed cache creates due to no space in cache") nospace_cull = Counter("nospace_cull", "Number of objects culled to make space when no space occurs") io_reads = Counter("io_reads", "Number of read operations by the cache") io_writes = Counter("io_writes", "Number of write operations by the cache") # get_stats() function # # Process the lines of /proc/fs/fscache/stats to get the metrics data. As each line is different, # we're going to use the first column to differentiate, since it has the type. # # In the case of counters, because we want to use rate and irate, and because the .inc method # adds the current value to the previous one (not preferred) rather than setting the new value (preferred), # we are going to use the private method ._value.set instead. With this in mind, counter values must either be # zero or positive. A negative will reset the counter, breaking graphing. The counter should only reset when # the process does. There is another option, using Gauges, but that makes rate calculations harder and it doesn't # allow for irate calculations. Information is in the Stack Overflow conversation below: # # https://stackoverflow.com/questions/47929310/how-update-counter-set-new-value-after-avery-request-not-increment-new-value-t def get_stats(): cache_stats = [] results = subprocess.run(["cat /proc/fs/fscache/stats"], stdout=subprocess.PIPE, text=True, shell=True) for line in (results.stdout.splitlines()): cache_stats.append(line.replace(":", "").split()) for line in (cache_stats): if (line[0] == "Cookies"): current_cookies.set(line[1].split("=")[1]) current_volumes.set(line[2].split("=")[1]) current_vol_col._value.set(int(line[3].split("=")[1])) current_vol_oom._value.set(int(line[4].split("=")[1])) if (line[0] == "Acquire"): acquire_cookies._value.set(int(line[1].split("=")[1])) acquire_successful._value.set(int(line[2].split("=")[1])) acquire_oom._value.set(int(line[3].split("=")[1])) if (line[0] == "LRU"): lru_current.set(line[1].split("=")[1]) lru_expired._value.set(int(line[2].split("=")[1])) lru_removed._value.set(int(line[3].split("=")[1])) lru_dropped._value.set(int(line[4].split("=")[1])) lru_cull.set(line[5].split("=")[1]) if (line[0] == "Invals"): inval_cookies._value.set(int(line[1].split("=")[1])) if (line[0] == "Updates"): update_cookies._value.set(int(line[1].split("=")[1])) resize_requests._value.set(int(line[2].split("=")[1])) resize_skips._value.set(int(line[3].split("=")[1])) if (line[0] == "Relinqs"): relinquish_cookies._value.set(int(line[1].split("=")[1])) relinquish_retires._value.set(int(line[2].split("=")[1])) relinquish_drops._value.set(int(line[3].split("=")[1])) if (line[0] == "NoSpace"): nospace_writes._value.set(int(line[1].split("=")[1])) nospace_creates._value.set(int(line[2].split("=")[1])) nospace_cull._value.set(int(line[3].split("=")[1])) if (line[0] == "IO"): io_reads._value.set(int(line[1].split("=")[1])) io_writes._value.set(int(line[2].split("=")[1])) if __name__ == "__main__": start_http_server(${metricsCustomStatsPort}) while True: get_stats() gc.collect() time.sleep(${metricsIntervalSeconds})