playbooks/roles/cyclecloud_cluster/projects/openpbs/cluster-init/files/autostop.rb (79 lines of code) (raw):

#!/usr/bin/env ruby
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#
# Autostop check for a PBS execute node.
#
# Each run records how long the node has had no job files under
# /var/spool/pbs/mom_priv/jobs. Once the node has been idle longer than the
# configured timeout, and it is not marked KeepAlive in the cluster status,
# the node is offlined with `pbsnodes -o` and `jetpack shutdown --idle` is run.
require 'json'

# Arguments
AUTOSTOP_ENABLED = `jetpack config cyclecloud.cluster.autoscale.stop_enabled`.downcase.strip == "true"
KEEPALIVE_THROTTLE = `jetpack config cyclecloud.cluster.autoscale.keep_alive_throttle 1800`.to_i
KEEPALIVE_FILE = '/opt/cycle/jetpack/run/node.keepalive'.freeze

# Short-circuit without error if not enabled
exit 0 unless AUTOSTOP_ENABLED

# Writes a timestamped message to stdout.
#
# @param msg [String] text to log
def log(msg)
  $stdout.write "#{Time.now}: #{msg}\n"
end

# Reads a single value via `jetpack config`.
#
# @param key [String] configuration key (only fixed literals from this script are passed)
# @return [String] the command output with surrounding whitespace removed
def jetpack_config(key)
  `jetpack config #{key}`.strip
end

# Requests this cluster's status (with nodes) from the configured web server.
#
# @return [Hash, nil] the parsed status document, or nil when the request
#   failed or the response was not a JSON object
def fetch_cluster_status
  username = jetpack_config('cyclecloud.config.username')
  password = jetpack_config('cyclecloud.config.password')
  web_server = jetpack_config('cyclecloud.config.web_server')
  cluster_name = jetpack_config('cyclecloud.cluster.name')
  url = "#{web_server}/clusters/#{cluster_name}/status?nodes=true"

  # Argument-array form runs curl without a shell, so special characters in
  # the credentials and the `?` in the URL are passed through literally.
  body = IO.popen(['curl', '-k', '-u', "#{username}:#{password}", url], &:read)
  status = JSON.parse(body)
  status.is_a?(Hash) ? status : nil
rescue JSON::ParserError, SystemCallError => e
  log "Unable to read cluster status: #{e.message}"
  nil
end

# If a node is marked KeepAlive==true, then we don't want to autostop.
#
# The positive answer is cached by the presence of KEEPALIVE_FILE so the
# cluster status is re-queried at most once every KEEPALIVE_THROTTLE seconds.
#
# @return [Object] truthy when the node must not be stopped; the raw
#   "KeepAlive" value of this node when it was found in the cluster status
def is_keepalive?
  # we want to throttle how often we re-check that KeepAlive is true
  if File.exist?(KEEPALIVE_FILE)
    if (Time.now - File.mtime(KEEPALIVE_FILE)) > KEEPALIVE_THROTTLE
      log "Deleting #{KEEPALIVE_FILE} because it has been more than #{KEEPALIVE_THROTTLE} seconds since it was created."
      log "Will be recreated the next iteration."
      File.delete KEEPALIVE_FILE
    end
    # either way, let's return true here. I want to give the autoscaler on the master a chance to terminate this
    # instance.
    return true
  end

  log "Checking to see if this node has KeepAlive=true"
  node_name = jetpack_config('cyclecloud.node.name')
  cluster_status = fetch_cluster_status
  nodes = cluster_status && cluster_status['nodes']
  nodes = [] unless nodes.is_a?(Array)

  # Assume true so that if we did not even find ourselves we will treat this as keepalive=true
  # if something is wrong with CC and it isn't reporting this node, I don't want to keep hammering it.
  am_i_keepalive = true
  nodes.each do |node|
    next unless node.is_a?(Hash) && node['Name'] == node_name

    am_i_keepalive = node['KeepAlive']
  end

  # cache the response by touching this file so we can throttle
  if am_i_keepalive
    log "This node does have KeepAlive=true. Creating #{KEEPALIVE_FILE} and will check again in #{KEEPALIVE_THROTTLE} seconds"
    File.open(KEEPALIVE_FILE, 'w') do |touch|
      touch.puts "This file was created because this node was marked as KeepAlive when this file was created. Will try again in #{KEEPALIVE_THROTTLE} seconds"
    end
  end

  am_i_keepalive
end

IDLE_TIME_AFTER_JOBS = `jetpack config cyclecloud.cluster.autoscale.idle_time_after_jobs`.to_i
IDLE_TIME_BEFORE_JOBS = `jetpack config cyclecloud.cluster.autoscale.idle_time_before_jobs`.to_i

# indicates if execute node has ever ran a job
#
# @return [Boolean] true when any mom log contains a "Job ... Started" record
def been_active?
  # Shell out to grep with -m 1 for lazy match, as the log files can grow quite large
  # and we only need to know if one job has ever started.
  # `grep -E` is the supported spelling of the obsolescent `egrep`.
  any_job = `grep -E -m 1 ';pbs_mom;Job;.+;Started' /var/spool/pbs/mom_logs/*`.strip
  !any_job.empty?
end

# indicates if there are currently running jobs
#
# @return [Boolean] true when at least one entry exists in the mom_priv jobs directory
def active?
  !Dir.glob('/var/spool/pbs/mom_priv/jobs/*').empty?
end

# This is our autoscale runtime configuration
runtime_config = { "idle_start_time" => nil }
AUTOSCALE_DATA = '/opt/cycle/jetpack/run/autoscale.json'.freeze

if File.exist?(AUTOSCALE_DATA)
  begin
    saved_config = JSON.parse(File.read(AUTOSCALE_DATA))
    runtime_config.merge!(saved_config) if saved_config.is_a?(Hash)
  rescue JSON::ParserError => e
    # An empty or malformed state file would otherwise abort every run before
    # the file is rewritten below; start over with a fresh idle timer instead.
    log "Ignoring unreadable #{AUTOSCALE_DATA}: #{e.message}"
  end
end

# Checks to see if we should shutdown
idle_long_enough = false

if active?
  runtime_config["idle_start_time"] = nil
elsif runtime_config["idle_start_time"].nil?
  runtime_config["idle_start_time"] = Time.now.to_i
else
  idle_seconds = Time.now - Time.at(runtime_config["idle_start_time"].to_i)
  # Different timeouts if the node has ever run a job
  timeout = been_active? ? IDLE_TIME_AFTER_JOBS : IDLE_TIME_BEFORE_JOBS
  idle_long_enough = idle_seconds > timeout
end

# Write the config information back for next time
File.open(AUTOSCALE_DATA, "w") do |file|
  file.puts JSON.pretty_generate(runtime_config)
end

# Do the shutdown if it is idle _and_ the node wasn't marked as KeepAlive.
# KeepAlive check is a bit expensive, so we will only check it if we are already idle
# and we also only check every 30 minutes by default.
if idle_long_enough && !is_keepalive?
  # strip the trailing newline that `hostname` prints
  myhost = `hostname`.strip
  system('bash', '-lc', "pbsnodes -o #{myhost}")
  sleep(5)
  system("jetpack shutdown --idle")
end