# kif.yaml - KIF rules and notification settings

# Watch rules, keyed by rule name. Each rule names the process(es) to look
# for (`procid`, optionally `uid` / `host_must_match`), the `triggers` that
# must be exceeded, and what to do about it (`runlist` and/or `kill`).
#
# NOTE(review): `runlist`, `kill`, `killwith`, `ignorematch` and `notify` are
# laid out as rule-level keys (siblings of `triggers`) - confirm against the
# KIF schema before deploying.
rules:
  loggy:
    description: 'loggy hogging all the memory'
    # The command line is given as a list of arguments.
    procid:
      - 'python2.7'
      - 'loggy.py'
      - '--daemonize'
      - '--user=root'
      - '--group=root'
    triggers:
      maxmemory: 1024mb
    runlist:
      - 'service loggy restart'
    notify: none

  postfix:
    description: 'postfix memory hogging prevention'
    procid: '/usr/lib/postfix/master'
    triggers:
      maxmemory: 50%
      maxfds: 10240
    runlist:
      - 'service postfix restart'

  slapd:
    description: 'LDAP memory hogging prevention'
    procid: '/usr/sbin/slapd'
    triggers:
      maxmemory: 50%
      maxfds: 1024
    runlist:
      - 'service slapd restart'

  elastic:
    description: 'Snappy ElasticSearch OOM'
    procid: '/etc/elasticsearch/asful'
    triggers:
      maxmemory: 40gb
      maxfds: 99999
    runlist:
      - 'service elasticsearch-asful restart'

  httpd:
    description: 'httpd too many backend connections (pool filling up?)'
    procid: '/usr/sbin/apache2'
    # With combine: true, the resources of all matching processes are
    # combined into a single check.
    combine: true
    triggers:
      maxlocalconns: 1000
    runlist:
      - 'service apache2 restart'

  gitwebzombies:
    description: 'Any gitweb process caught in zombie mode'
    procid: '/usr/bin/git'
    triggers:
      # `state` can be any process state (zombie, sleeping, running, etc).
      # Or a git process > 30 minutes old.
      state: 'zombie'
      maxage: 30m
    kill: true
    killwith: 9

  gitcorezombies:
    host_must_match: "gitbox2-he-fi.apache.org"
    description: 'Git core procs that are hanging on gitbox'
    procid: '/usr/lib/git-core/git-http-backend'
    triggers:
      state: 'zombie'
      maxage: 12h
    kill: true
    killwith: 9

  staged_git_frozen:
    host_must_match: "(staging-vm-he-de|tlpserver-he-fi).apache.org"
    description: 'Git checkouts stuck on tlpserver-he-fi'
    procid: '/usr/lib/git-core/git-remote-https'
    triggers:
      maxage: 30m
    kill: true
    killwith: 9

  jenkins_node_hung_processes:
    host_must_match: "asf9\\d+.gq1.ygridcore.net"
    description: "kill -9 jenkins procs that are hanging"
    procid: ""
    uid: jenkins
    # Find all processes created more than 36 hours ago.
    triggers:
      maxage: 36h
    # Ignore a subset of jenkins processes,
    # including the slave jar and the remote master ssh connection.
    ignorematch:
      - agent.jar
      - remoting.jar
      - slave.jar
      - sshd
      - systemd
      - sd-pam
    # Kill it with signal 9.
    kill: true
    killwith: 9
    # Notify via email to keep an eye on what's getting kiffed.
    notify: email

# Sender and recipient used for email notifications.
notifications:
  email:
    rcpt: 'notifications@infra.apache.org'
    from: 'KIF <kif@apache.org>'