(POD-8) Add timeout_notification config to log warning before vm is destroyed.

This commit is contained in:
isaac-hammes 2023-08-22 12:09:17 -07:00
parent e6380a6e02
commit 17a1831952
7 changed files with 86 additions and 52 deletions

View file

@ -186,6 +186,7 @@ GEM
PLATFORMS
arm64-darwin-22
universal-java-11
x86_64-darwin-22
x86_64-linux
DEPENDENCIES

View file

@ -17,6 +17,7 @@
logfile: '/Users/samuel/workspace/vmpooler/vmpooler.log'
task_limit: 10
timeout: 15
timeout_notification: 5
vm_checktime: 1
vm_lifetime: 12
vm_lifetime_auth: 24
@ -38,6 +39,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'
@ -48,6 +50,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'
@ -58,6 +61,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'
@ -67,6 +71,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'
@ -77,6 +82,7 @@
datastore: 'other-vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'

View file

@ -82,6 +82,7 @@ module Vmpooler
end
parsed_config[:config]['clone_target'] = ENV['CLONE_TARGET'] if ENV['CLONE_TARGET']
parsed_config[:config]['timeout'] = string_to_int(ENV['TIMEOUT']) if ENV['TIMEOUT']
parsed_config[:config]['timeout_notification'] = string_to_int(ENV['TIMEOUT_NOTIFICATION']) if ENV['TIMEOUT_NOTIFICATION']
parsed_config[:config]['vm_lifetime_auth'] = string_to_int(ENV['VM_LIFETIME_AUTH']) if ENV['VM_LIFETIME_AUTH']
parsed_config[:config]['max_tries'] = string_to_int(ENV['MAX_TRIES']) if ENV['MAX_TRIES']
parsed_config[:config]['retry_factor'] = string_to_int(ENV['RETRY_FACTOR']) if ENV['RETRY_FACTOR']

View file

@ -82,21 +82,21 @@ module Vmpooler
end
# Check the state of a VM
def check_pending_vm(vm, pool, timeout, provider)
def check_pending_vm(vm, pool, timeout, timeout_notification, provider)
Thread.new do
begin
_check_pending_vm(vm, pool, timeout, provider)
_check_pending_vm(vm, pool, timeout, timeout_notification, provider)
rescue StandardError => e
$logger.log('s', "[!] [#{pool}] '#{vm}' #{timeout} #{provider} errored while checking a pending vm : #{e}")
@redis.with_metrics do |redis|
fail_pending_vm(vm, pool, timeout, redis)
fail_pending_vm(vm, pool, timeout, timeout_notification, redis)
end
raise
end
end
end
def _check_pending_vm(vm, pool, timeout, provider)
def _check_pending_vm(vm, pool, timeout, timeout_notification, provider)
mutex = vm_mutex(vm)
return if mutex.locked?
@ -106,7 +106,7 @@ module Vmpooler
if provider.vm_ready?(pool, vm, redis)
move_pending_vm_to_ready(vm, pool, redis, request_id)
else
fail_pending_vm(vm, pool, timeout, redis)
fail_pending_vm(vm, pool, timeout, timeout_notification, redis)
end
end
end
@ -122,35 +122,53 @@ module Vmpooler
$logger.log('d', "[!] [#{pool}] '#{vm}' no longer exists. Removing from pending.")
end
def fail_pending_vm(vm, pool, timeout, redis, exists: true)
def fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)
clone_stamp = redis.hget("vmpooler__vm__#{vm}", 'clone')
time_since_clone = (Time.now - Time.parse(clone_stamp)) / 60
if time_since_clone > timeout
if exists
request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
if request_id
ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
# will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
end
end
$metrics.increment("errors.markedasfailed.#{pool}")
$logger.log('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}")
else
already_timed_out = time_since_clone > timeout
timing_out_soon = time_since_clone > timeout_notification && !redis.hget("vmpooler__vm__#{vm}", 'timeout_notification')
if already_timed_out
unless exists
remove_nonexistent_vm(vm, pool, redis)
return true
end
open_socket_error = handle_timed_out_vm(vm, pool, redis)
end
redis.hset("vmpooler__vm__#{vm}", 'timeout_notification', 1) if timing_out_soon
nonexist_warning = if already_timed_out
"[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: #{open_socket_error}"
elsif timing_out_soon
"[!] [#{pool}] '#{vm}' no longer exists when attempting to send notification of impending failure"
else
"[!] [#{pool}] '#{vm}' This error is wholly unexpected"
end
$logger.log('d', nonexist_warning)
true
rescue StandardError => e
$logger.log('d', "Fail pending VM failed with an error: #{e}")
false
end
def handle_timed_out_vm(vm, pool, redis)
request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
if request_id
ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
# will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
end
end
$metrics.increment("errors.markedasfailed.#{pool}")
open_socket_error
end
def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
clone_time = redis.hget("vmpooler__vm__#{vm}", 'clone')
finish = format('%<time>.2f', time: Time.now - Time.parse(clone_time))
@ -1250,19 +1268,20 @@ module Vmpooler
end
end
def check_pending_pool_vms(pool_name, provider, pool_check_response, inventory, pool_timeout)
def check_pending_pool_vms(pool_name, provider, pool_check_response, inventory, pool_timeout, pool_timeout_notification)
pool_timeout ||= $config[:config]['timeout'] || 15
pool_timeout_notification ||= $config[:config]['timeout_notification'] || 5
@redis.with_metrics do |redis|
redis.smembers("vmpooler__pending__#{pool_name}").reverse.each do |vm|
if inventory[vm]
begin
pool_check_response[:checked_pending_vms] += 1
check_pending_vm(vm, pool_name, pool_timeout, provider)
check_pending_vm(vm, pool_name, pool_timeout, pool_timeout_notification, provider)
rescue StandardError => e
$logger.log('d', "[!] [#{pool_name}] _check_pool failed with an error while evaluating pending VMs: #{e}")
end
else
fail_pending_vm(vm, pool_name, pool_timeout, redis, exists: false)
fail_pending_vm(vm, pool_name, pool_timeout, pool_timeout_notification, redis, exists: false)
end
end
end
@ -1389,7 +1408,7 @@ module Vmpooler
check_ready_pool_vms(pool['name'], provider, pool_check_response, inventory, pool['ready_ttl'] || $config[:config]['ready_ttl'])
check_pending_pool_vms(pool['name'], provider, pool_check_response, inventory, pool['timeout'])
check_pending_pool_vms(pool['name'], provider, pool_check_response, inventory, pool['timeout'], pool['timeout_notification'])
check_completed_pool_vms(pool['name'], provider, pool_check_response, inventory)

View file

@ -14,6 +14,7 @@ describe 'Pool Manager' do
let(:pool) { 'pool1' }
let(:vm) { 'vm1' }
let(:timeout) { 5 }
let(:timeout_notification) { 2 }
let(:host) { double('host') }
let(:token) { 'token1234' }
let(:request_id) { '1234' }
@ -189,9 +190,9 @@ EOT
it 'calls _check_pending_vm' do
expect(Thread).to receive(:new).and_yield
expect(subject).to receive(:_check_pending_vm).with(vm,pool,timeout,provider)
expect(subject).to receive(:_check_pending_vm).with(vm, pool, timeout, timeout_notification, provider)
subject.check_pending_vm(vm, pool, timeout, provider)
subject.check_pending_vm(vm, pool, timeout, timeout_notification, provider)
end
end
@ -208,16 +209,16 @@ EOT
expect(subject).to receive(:move_pending_vm_to_ready).with(vm, pool, redis, nil)
end
subject._check_pending_vm(vm, pool, timeout, provider)
subject._check_pending_vm(vm, pool, timeout, timeout_notification, provider)
end
it 'calls fail_pending_vm if host is not ready' do
redis_connection_pool.with do |redis|
expect(provider).to receive(:vm_ready?).with(pool, vm, redis).and_return(false)
expect(subject).to receive(:fail_pending_vm).with(vm, pool, timeout, redis)
expect(subject).to receive(:fail_pending_vm).with(vm, pool, timeout, timeout_notification, redis)
end
subject._check_pending_vm(vm, pool, timeout, provider)
subject._check_pending_vm(vm, pool, timeout, timeout_notification, provider)
end
end
@ -230,7 +231,7 @@ EOT
it 'should return' do
expect(subject).to receive(:vm_mutex).and_return(mutex)
expect(subject._check_pending_vm(vm, pool, timeout, provider)).to be_nil
expect(subject._check_pending_vm(vm, pool, timeout, timeout_notification, provider)).to be_nil
end
end
end
@ -274,14 +275,14 @@ EOT
it 'takes no action if VM is not cloning' do
redis_connection_pool.with do |redis|
expect(subject.fail_pending_vm(vm, pool, timeout, redis)).to eq(true)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis)).to eq(true)
end
end
it 'takes no action if VM is within timeout' do
redis_connection_pool.with do |redis|
redis.hset("vmpooler__vm__#{vm}", 'clone',Time.now.to_s)
expect(subject.fail_pending_vm(vm, pool, timeout, redis)).to eq(true)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis)).to eq(true)
expect(redis.sismember("vmpooler__pending__#{pool}", vm)).to be(true)
end
end
@ -289,7 +290,7 @@ EOT
it 'moves VM to completed queue if VM has exceeded timeout and exists' do
redis_connection_pool.with do |redis|
redis.hset("vmpooler__vm__#{vm}", 'clone',Date.new(2001,1,1).to_s)
expect(subject.fail_pending_vm(vm, pool, timeout, redis, exists: true)).to eq(true)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)).to eq(true)
expect(redis.sismember("vmpooler__pending__#{pool}", vm)).to be(false)
expect(redis.sismember("vmpooler__completed__#{pool}", vm)).to be(true)
end
@ -299,7 +300,7 @@ EOT
redis_connection_pool.with do |redis|
redis.hset("vmpooler__vm__#{vm}", 'clone',Date.new(2001,1,1).to_s)
expect(logger).to receive(:log).with('d', "[!] [#{pool}] '#{vm}' marked as 'failed' after #{timeout} minutes with error: ")
expect(subject.fail_pending_vm(vm, pool, timeout, redis, exists: true)).to eq(true)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)).to eq(true)
end
end
@ -307,14 +308,14 @@ EOT
redis_connection_pool.with do |redis|
redis.hset("vmpooler__vm__#{vm}", 'clone',Date.new(2001,1,1).to_s)
expect(subject).to receive(:remove_nonexistent_vm).with(vm, pool, redis)
expect(subject.fail_pending_vm(vm, pool, timeout, redis, exists: false)).to eq(true)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: false)).to eq(true)
end
end
it 'swallows error if an error is raised' do
redis_connection_pool.with do |redis|
redis.hset("vmpooler__vm__#{vm}", 'clone','iamnotparsable_asdate')
expect(subject.fail_pending_vm(vm, pool, timeout, redis, exists: true)).to eq(false)
expect(subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)).to eq(false)
end
end
@ -323,7 +324,7 @@ EOT
redis.hset("vmpooler__vm__#{vm}", 'clone','iamnotparsable_asdate')
expect(logger).to receive(:log).with('d', String)
subject.fail_pending_vm(vm, pool, timeout, redis, exists: true)
subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)
end
end
@ -334,7 +335,7 @@ EOT
redis.hset("vmpooler__vm__#{vm}", 'clone',(Time.now - 900).to_s)
redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool)
redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id)
subject.fail_pending_vm(vm, pool, timeout, redis, exists: true)
subject.fail_pending_vm(vm, pool, timeout, timeout_notification, redis, exists: true)
expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).to eq(["#{pool}:#{pool}:1:#{request_id}"])
end
end
@ -4125,10 +4126,10 @@ EOT
it 'should call fail_pending_vm' do
redis_connection_pool.with do |redis|
expect(subject).to receive(:fail_pending_vm).with(vm, pool, Integer, redis, exists: false)
expect(subject).to receive(:fail_pending_vm).with(vm, pool, Integer, Integer, redis, exists: false)
end
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout, timeout_notification)
end
end
@ -4146,7 +4147,7 @@ EOT
it 'should return the number of checked pending VMs' do
allow(subject).to receive(:check_pending_vm)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout, timeout_notification)
expect(pool_check_response[:checked_pending_vms]).to be(1)
end
@ -4155,31 +4156,31 @@ EOT
expect(subject).to receive(:check_pending_vm).and_raise(RuntimeError,'MockError')
expect(logger).to receive(:log).with('d', "[!] [#{pool}] _check_pool failed with an error while evaluating pending VMs: MockError")
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout, timeout_notification)
end
it 'should use the pool timeout if set' do
big_lifetime = 2000
config[:pools][0]['timeout'] = big_lifetime
expect(subject).to receive(:check_pending_vm).with(vm,pool,big_lifetime,provider)
expect(subject).to receive(:check_pending_vm).with(vm,pool, big_lifetime, timeout_notification, provider)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, big_lifetime)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, big_lifetime, timeout_notification)
end
it 'should use the configuration setting if the pool timeout is not set' do
big_lifetime = 2000
config[:config]['timeout'] = big_lifetime
expect(subject).to receive(:check_pending_vm).with(vm,pool,big_lifetime,provider)
expect(subject).to receive(:check_pending_vm).with(vm, pool, big_lifetime, timeout_notification, provider)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, big_lifetime)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, big_lifetime, timeout_notification)
end
it 'should use a pool timeout of 15 if nothing is set' do
expect(subject).to receive(:check_pending_vm).with(vm,pool,timeout,provider)
expect(subject).to receive(:check_pending_vm).with(vm, pool, timeout, timeout_notification, provider)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout)
subject.check_pending_pool_vms(pool, provider, pool_check_response, inventory, timeout, timeout_notification)
end
end
end
@ -4841,7 +4842,7 @@ EOT
it 'should call #check_ready_pool_vms' do
allow(subject).to receive(:create_inventory).and_return({})
expect(subject).to receive(:check_pending_pool_vms).with(pool, provider, pool_check_response, {}, pool_object['timeout'])
expect(subject).to receive(:check_pending_pool_vms).with(pool, provider, pool_check_response, {}, pool_object['timeout'], pool_object['timeout_notification'])
subject._check_pool(pool_object,provider)
end

View file

@ -23,6 +23,7 @@
logfile: '/var/log/vmpooler.log'
task_limit: 10
timeout: 15
timeout_notification: 5
vm_checktime: 1
vm_lifetime: 12
vm_lifetime_auth: 24
@ -39,6 +40,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'
@ -49,6 +51,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: dummy
dns_plugin: 'example'

View file

@ -615,6 +615,7 @@
logfile: '/var/log/vmpooler.log'
task_limit: 10
timeout: 15
timeout_notification: 5
vm_checktime: 1
vm_lifetime: 12
vm_lifetime_auth: 24
@ -737,6 +738,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: vsphere
create_linked_clone: true
@ -747,6 +749,7 @@
datastore: 'vmstorage'
size: 5
timeout: 15
timeout_notification: 5
ready_ttl: 1440
provider: vsphere
create_linked_clone: false