in tools/jenkins-slave-creation-unix/scripts/deploy/slave-autoconnect.py [0:0]
def _parse_args():
    """Parse the command-line options locating the Jenkins master and the slave slot."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--master',
                        help='URL of jenkins master',
                        # default='http://jenkins.mxnet-ci.com',
                        type=str)
    parser.add_argument('-mf', '--master-file',
                        help='File containing URL of jenkins master',
                        # default='/home/jenkins_slave/jenkins_master_url',
                        type=str)
    parser.add_argument('-mp', '--master-private',
                        help='Private URL of jenkins master',
                        # default='http://jenkins-priv.mxnet-ci.com',
                        type=str)
    parser.add_argument('-mpf', '--master-private-file',
                        help='File containing private URL of jenkins master',
                        # default='/home/jenkins_slave/jenkins_master_private_url',
                        type=str)
    parser.add_argument('-snf', '--slave-name-file',
                        help='File containing name of the slave slot',
                        type=str)
    return parser.parse_args()


def _find_candidate_nodes(server, slave_name):
    """Return the names of the slave slots this instance should try to claim.

    If ``slave_name`` is truthy, only that slot is returned (manual mode).
    Otherwise the master is queried via ``server.get_nodes()`` and every node
    accepted by ``is_offline_node_matches_prefix`` for the local label is
    returned in random order (auto mode). The result may be empty.
    """
    if slave_name:
        logging.info('Entering manual connect mode to slave slot %s', slave_name)
        return [slave_name]

    logging.info('Entering auto connect mode')
    label = generate_node_label()
    logging.info('Local node prefix: %s', label)

    offline_nodes = [node['name'] for node in server.get_nodes()
                     if is_offline_node_matches_prefix(label, node)]
    # logging interpolates with %-style placeholders; a '{}' placeholder here
    # would make the handler fail to format the record.
    logging.debug('Offline nodes: %s', offline_nodes)

    # Shuffle to provide random order to reduce race conditions if multiple instances
    # are started at the same time and thus try to connect to the same slot, possibly
    # resulting in a hang
    random.shuffle(offline_nodes)
    return offline_nodes


def main() -> int:
    """Download the slave jar and keep trying to attach this machine to a Jenkins slot.

    Reads the master URLs and the optional slot name from the command-line
    arguments, downloads the slave jar, prepares the ``workspace`` directory
    under the current working directory and then repeatedly calls
    ``connect_to_master`` for each candidate slot.

    The retry counter is reset whenever a call to ``connect_to_master`` took
    longer than ``RETRY_COUNTER_RESET_TIME_SECONDS``; otherwise the loop gives
    up after ``RETRY_LIMIT`` rounds.

    Returns:
        1 on every exit path visible here: no free slot, too many attempts,
        or an unexpected exception (which is logged, not propagated).
    """
    try:
        logging.getLogger().setLevel(logging.DEBUG)

        args = _parse_args()
        master_url, master_private_url = read_master_urls(args)
        slave_name = read_name_from_path(args.slave_name_file)

        # Build the URL with explicit '/' separators. os.path.join is meant for
        # filesystem paths: it uses '\' on Windows and drops master_url
        # entirely when the second component starts with '/'.
        jenkins_slave_jar_url = '{}/{}'.format(master_url.rstrip('/'),
                                               AGENT_SLAVE_JAR_PATH.lstrip('/'))

        # Download jenkins slave jar
        download(jenkins_slave_jar_url, LOCAL_SLAVE_JAR_PATH)

        work_dir = os.path.join(os.getcwd(), 'workspace')
        logging.info('Work dir: %s', work_dir)

        # Create work dir if it doesnt exist
        os.makedirs(work_dir, exist_ok=True)
        os.makedirs(os.path.join(work_dir, 'remoting'), exist_ok=True)

        server = jenkins.Jenkins(master_url)

        attempt = 0
        while attempt < RETRY_LIMIT:
            attempt += 1

            offline_nodes = _find_candidate_nodes(server, slave_name)
            if not offline_nodes:
                rename_instance('error-no-free-slot')
                logging.fatal('Could not connect to master - no free slots')
                return 1

            reset = False
            # Loop through nodes and try to connect
            for node_name in offline_nodes:
                start_time = time.time()
                # presumably blocks for as long as the agent stays connected;
                # the elapsed time below is used on that assumption
                connect_to_master(node_name=node_name,
                                  master_private_url=master_private_url,
                                  work_dir=work_dir)
                total_runtime_seconds = time.time() - start_time

                if total_runtime_seconds > RETRY_COUNTER_RESET_TIME_SECONDS:
                    logging.info('Instance ran for %s seconds, resetting retry counter',
                                 total_runtime_seconds)
                    reset = True
                else:
                    logging.info('Unable to connect as node %s', node_name)
                    # Rename this instance to show it was unable to connect
                    # NOTE(review): applied to failed attempts only - confirm
                    # this matches the intended nesting.
                    rename_instance('{}-unable-to-connect'.format(node_name))

                # Random back-off before trying the next slot
                time.sleep(random.randint(RETRY_SLEEP_MIN_SECONDS, RETRY_SLEEP_MAX_SECONDS))

            if reset:
                logging.info('Resetting repetition counter')
                attempt = 0

        rename_instance('error-too-many-attempts')
        logging.fatal('Could not connect to master - too many attempts')
        return 1
    except Exception:
        # Top-level boundary: log the traceback and report failure through
        # the return code instead of letting the exception propagate.
        logging.exception('Fatal exception')
        logging.fatal('Fatal exception, aborting execution')
        return 1