From 17a1831952d70b95cb458e95e5515d9790a0a6b5 Mon Sep 17 00:00:00 2001 From: isaac-hammes Date: Tue, 22 Aug 2023 12:09:17 -0700 Subject: [PATCH] (POD-8) Add timeout_notification config to log warning before vm is destroyed. --- Gemfile.lock | 1 + .../vmpooler.yaml.dummy-example.aliasedpools | 6 ++ lib/vmpooler.rb | 1 + lib/vmpooler/pool_manager.rb | 73 ++++++++++++------- spec/unit/pool_manager_spec.rb | 51 ++++++------- vmpooler.yaml.dummy-example | 3 + vmpooler.yaml.example | 3 + 7 files changed, 86 insertions(+), 52 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 0d89963..4a94d9c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -186,6 +186,7 @@ GEM PLATFORMS arm64-darwin-22 universal-java-11 + x86_64-darwin-22 x86_64-linux DEPENDENCIES diff --git a/examples/vmpooler.yaml.dummy-example.aliasedpools b/examples/vmpooler.yaml.dummy-example.aliasedpools index ebece50..55bf9ff 100644 --- a/examples/vmpooler.yaml.dummy-example.aliasedpools +++ b/examples/vmpooler.yaml.dummy-example.aliasedpools @@ -17,6 +17,7 @@ logfile: '/Users/samuel/workspace/vmpooler/vmpooler.log' task_limit: 10 timeout: 15 + timeout_notification: 5 vm_checktime: 1 vm_lifetime: 12 vm_lifetime_auth: 24 @@ -38,6 +39,7 @@ datastore: 'vmstorage' size: 5 timeout: 15 + timeout_notification: 5 ready_ttl: 1440 provider: dummy dns_plugin: 'example' @@ -48,6 +50,7 @@ datastore: 'vmstorage' size: 5 timeout: 15 + timeout_notification: 5 ready_ttl: 1440 provider: dummy dns_plugin: 'example' @@ -58,6 +61,7 @@ datastore: 'vmstorage' size: 5 timeout: 15 + timeout_notification: 5 ready_ttl: 1440 provider: dummy dns_plugin: 'example' @@ -67,6 +71,7 @@ datastore: 'vmstorage' size: 5 timeout: 15 + timeout_notification: 5 ready_ttl: 1440 provider: dummy dns_plugin: 'example' @@ -77,6 +82,7 @@ datastore: 'other-vmstorage' size: 5 timeout: 15 + timeout_notification: 5 ready_ttl: 1440 provider: dummy dns_plugin: 'example' diff --git a/lib/vmpooler.rb b/lib/vmpooler.rb index 985a72b..2fcde30 100644 --- a/lib/vmpooler.rb +++ b/lib/vmpooler.rb @@ -82,6 +82,7 @@ module Vmpooler end parsed_config[:config]['clone_target'] = ENV['CLONE_TARGET'] if ENV['CLONE_TARGET'] parsed_config[:config]['timeout'] = string_to_int(ENV['TIMEOUT']) if ENV['TIMEOUT'] + parsed_config[:config]['timeout_notification'] = string_to_int(ENV['TIMEOUT_NOTIFICATION']) if ENV['TIMEOUT_NOTIFICATION'] parsed_config[:config]['vm_lifetime_auth'] = string_to_int(ENV['VM_LIFETIME_AUTH']) if ENV['VM_LIFETIME_AUTH'] parsed_config[:config]['max_tries'] = string_to_int(ENV['MAX_TRIES']) if ENV['MAX_TRIES'] parsed_config[:config]['retry_factor'] = string_to_int(ENV['RETRY_FACTOR']) if ENV['RETRY_FACTOR'] diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index d6735c1..d16db1a 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -82,21 +82,21 @@ module Vmpooler end # Check the state of a VM - def check_pending_vm(vm, pool, timeout, provider) + def check_pending_vm(vm, pool, timeout, timeout_notification, provider) Thread.new do begin - _check_pending_vm(vm, pool, timeout, provider) + _check_pending_vm(vm, pool, timeout, timeout_notification, provider) rescue StandardError => e $logger.log('s', "[!] [#{pool}] '#{vm}' #{timeout} #{provider} errored while checking a pending vm : #{e}") @redis.with_metrics do |redis| - fail_pending_vm(vm, pool, timeout, redis) + fail_pending_vm(vm, pool, timeout, timeout_notification, redis) end raise end end end - def _check_pending_vm(vm, pool, timeout, provider) + def _check_pending_vm(vm, pool, timeout, timeout_notification, provider) mutex = vm_mutex(vm) return if mutex.locked? @@ -106,7 +106,7 @@ module Vmpooler if provider.vm_ready?(pool, vm, redis) move_pending_vm_to_ready(vm, pool, redis, request_id) else - fail_pending_vm(vm, pool, timeout, redis) + fail_pending_vm(vm, pool, timeout, timeout_notification, redis) end end end @@ -122,35 +122,53 @@ module Vmpooler $logger.log('d', "[!] [#{pool}] '#{vm}' no longer exists. Removing from pending.") end - def fail_pending_vm(vm, pool, timeout, redis, exists: true) + def fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true) clone_stamp = redis.hget("vmpooler__vm__#{vm}", 'clone') - time_since_clone = (Time.now - Time.parse(clone_stamp)) / 60 - if time_since_clone > timeout - if exists - request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id') - pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id - open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error') - redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm) - if request_id - ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}") - if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted' - # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted - redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}") - end - end - $metrics.increment("errors.markedasfailed.#{pool}") - $logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}") - else + + already_timed_out = time_since_clone > timeout + timing_out_soon = time_since_clone > timeout_notification && !redis.hget("vmpooler__vm__#{vm}", 'timeout_notification') + + if already_timed_out + unless exists remove_nonexistent_vm(vm, pool, redis) + return true end + open_socket_error = handle_timed_out_vm(vm, pool, redis) end + + redis.hset("vmpooler__vm__#{vm}", 'timeout_notification', 1) if timing_out_soon + + nonexist_warning = if already_timed_out + "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}" + elsif timing_out_soon + "[!] [#{pool}] '#{vm}' no longer exists when attempting to send notification of impending failure" + else + "[!] [#{pool}] '#{vm}' This error is wholly unexpected" + end + $logger.log('d', nonexist_warning) true rescue StandardError => e $logger.log('d', "Fail pending VM failed with an error: #{e}") false end + def handle_timed_out_vm(vm, pool, redis) + request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id') + pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id + open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error') + redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm) + if request_id + ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}") + if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted' + # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted + redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}") + end + end + $metrics.increment("errors.markedasfailed.#{pool}") + open_socket_error + end + def move_pending_vm_to_ready(vm, pool, redis, request_id = nil) clone_time = redis.hget("vmpooler__vm__#{vm}", 'clone') finish = format('%