mirror of
https://github.com/puppetlabs/vmpooler.git
synced 2026-01-26 01:58:41 -05:00
Merge branch 'main' into P4DEVOPS-8567
This commit is contained in:
commit
c24fe28d6d
3 changed files with 216 additions and 4 deletions
|
|
@ -168,16 +168,73 @@ module Vmpooler
|
||||||
open_socket_error || 'VM timed out during pending phase',
|
open_socket_error || 'VM timed out during pending phase',
|
||||||
redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
|
redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count)
|
||||||
|
|
||||||
|
clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
|
||||||
|
clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
|
||||||
redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
|
redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
|
||||||
|
|
||||||
if request_id
|
if request_id
|
||||||
ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
|
ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
|
||||||
if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
|
if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
|
||||||
# will retry a VM that did not come up as vm_ready? only if it has not been marked failed or deleted
|
# Check retry count and max retry limit before retrying
|
||||||
|
retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
|
||||||
|
max_retries = $config[:config]['max_vm_retries'] || 3
|
||||||
|
|
||||||
|
$logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
|
||||||
|
|
||||||
|
# Determine if error is likely permanent (configuration issues)
|
||||||
|
permanent_error = permanent_error?(clone_error, clone_error_class)
|
||||||
|
$logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")
|
||||||
|
|
||||||
|
if retry_count < max_retries && !permanent_error
|
||||||
|
# Increment retry count and retry VM creation
|
||||||
|
redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
|
||||||
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
|
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
|
||||||
|
$logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
|
||||||
|
else
|
||||||
|
# Max retries exceeded or permanent error, mark request as permanently failed
|
||||||
|
failure_reason = if permanent_error
|
||||||
|
"Configuration error: #{clone_error}"
|
||||||
|
else
|
||||||
|
'Max retry attempts exceeded'
|
||||||
|
end
|
||||||
|
redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
|
||||||
|
redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
|
||||||
|
$logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
|
||||||
|
$metrics.increment("errors.permanently_failed.#{pool}")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
$metrics.increment("errors.markedasfailed.#{pool}")
|
$metrics.increment("errors.markedasfailed.#{pool}")
|
||||||
open_socket_error
|
open_socket_error || clone_error
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determine if an error is likely permanent (configuration issue) vs transient
|
||||||
|
# Message fragments (matched case-insensitively) that are treated as signs of a
# configuration problem. Frozen and built once so the list is not reallocated
# on every call.
PERMANENT_ERROR_PATTERNS = [
  /template.*not found/i,
  /template.*does not exist/i,
  /invalid.*path/i,
  /folder.*not found/i,
  /datastore.*not found/i,
  /resource pool.*not found/i,
  /permission.*denied/i,
  /authentication.*failed/i,
  /invalid.*credentials/i,
  /configuration.*error/i
].freeze

# Exception class names that are always classified as permanent.
PERMANENT_ERROR_CLASSES = %w[ArgumentError NoMethodError NameError].freeze

# Classify a clone failure as permanent or not, based on its message and
# exception class name.
#
# @param error_message [String, nil] the failure message
# @param error_class [String, nil] the exception class name, e.g. 'ArgumentError'
# @return [Boolean] true when the class name is in PERMANENT_ERROR_CLASSES or the
#   message matches any of PERMANENT_ERROR_PATTERNS; false otherwise, and always
#   false when either argument is nil
def permanent_error?(error_message, error_class)
  # Without both pieces of information the failure is never classified as permanent.
  return false if error_message.nil? || error_class.nil?

  PERMANENT_ERROR_PATTERNS.any? { |pattern| error_message.match?(pattern) } ||
    PERMANENT_ERROR_CLASSES.include?(error_class)
end
|
||||||
|
|
||||||
def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
|
def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
|
||||||
|
|
@ -435,7 +492,13 @@ module Vmpooler
|
||||||
if request_id
|
if request_id
|
||||||
$logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
|
$logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
|
||||||
@redis.with_metrics do |redis|
|
@redis.with_metrics do |redis|
|
||||||
|
# Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
|
||||||
|
request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
|
||||||
|
if request_status != 'failed'
|
||||||
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
|
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
|
||||||
|
else
|
||||||
|
$logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
$logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
|
$logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
|
||||||
|
|
@ -559,6 +622,7 @@ module Vmpooler
|
||||||
dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
|
dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
|
||||||
dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
|
dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
|
||||||
rescue StandardError => e
|
rescue StandardError => e
|
||||||
|
# Store error details for retry decision making
|
||||||
@redis.with_metrics do |redis|
|
@redis.with_metrics do |redis|
|
||||||
# Get retry count before moving to DLQ
|
# Get retry count before moving to DLQ
|
||||||
retry_count = 0
|
retry_count = 0
|
||||||
|
|
@ -573,10 +637,34 @@ module Vmpooler
|
||||||
|
|
||||||
redis.pipelined do |pipeline|
|
redis.pipelined do |pipeline|
|
||||||
pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
|
pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
|
||||||
|
pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
|
||||||
|
pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
|
||||||
expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
|
expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
|
||||||
pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
|
pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Handle retry logic for on-demand requests
|
||||||
|
if request_id
|
||||||
|
retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
|
||||||
|
max_retries = $config[:config]['max_vm_retries'] || 3
|
||||||
|
is_permanent = permanent_error?(e.message, e.class.name)
|
||||||
|
|
||||||
|
$logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
|
||||||
|
|
||||||
|
if is_permanent || retry_count >= max_retries
|
||||||
|
reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
|
||||||
|
$logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
|
||||||
|
redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
|
||||||
|
redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
|
||||||
|
else
|
||||||
|
# Increment retry count and re-queue for retry
|
||||||
|
redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
|
||||||
|
$logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
|
||||||
|
redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
$logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
|
||||||
raise
|
raise
|
||||||
ensure
|
ensure
|
||||||
@redis.with_metrics do |redis|
|
@redis.with_metrics do |redis|
|
||||||
|
|
|
||||||
|
|
@ -345,6 +345,123 @@ EOT
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
describe '#handle_timed_out_vm' do
  # Redis keys and the re-queue entry that the examples below read and write.
  let(:vm_hash_key) { "vmpooler__vm__#{vm}" }
  let(:odrequest_key) { "vmpooler__odrequest__#{request_id}" }
  let(:requeued_task) { "#{pool}:#{pool}:1:#{request_id}" }

  # Links the VM to an on-demand request in 'pending' status and stores the
  # given clone failure details on the VM hash.
  def stage_request_with_clone_error(conn, message, error_class)
    conn.hset(vm_hash_key, 'request_id', request_id)
    conn.hset(vm_hash_key, 'pool_alias', pool)
    conn.hset(odrequest_key, 'status', 'pending')
    conn.hset(vm_hash_key, 'clone_error', message)
    conn.hset(vm_hash_key, 'clone_error_class', error_class)
  end

  before(:each) do
    expect(subject).not_to be_nil

    redis_connection_pool.with do |conn|
      create_pending_vm(pool, vm, conn)
      config[:config]['max_vm_retries'] = 3
    end
  end

  context 'without request_id' do
    it 'moves VM to completed queue and returns error' do
      redis_connection_pool.with do |conn|
        conn.hset(vm_hash_key, 'open_socket_error', 'connection failed')

        outcome = subject.handle_timed_out_vm(vm, pool, conn)

        expect(conn.sismember("vmpooler__pending__#{pool}", vm)).to be(false)
        expect(conn.sismember("vmpooler__completed__#{pool}", vm)).to be(true)
        expect(outcome).to eq('connection failed')
      end
    end
  end

  context 'with request_id and transient error' do
    before(:each) do
      redis_connection_pool.with do |conn|
        stage_request_with_clone_error(conn, 'network timeout', 'Timeout::Error')
      end
    end

    it 'retries on first failure' do
      redis_connection_pool.with do |conn|
        subject.handle_timed_out_vm(vm, pool, conn)

        expect(conn.hget(odrequest_key, 'retry_count')).to eq('1')
        expect(conn.zrange('vmpooler__odcreate__task', 0, -1)).to include(requeued_task)
      end
    end

    it 'marks as failed after max retries' do
      redis_connection_pool.with do |conn|
        # Start at the configured limit so no further attempt is allowed.
        conn.hset(odrequest_key, 'retry_count', '3')

        subject.handle_timed_out_vm(vm, pool, conn)

        expect(conn.hget(odrequest_key, 'status')).to eq('failed')
        expect(conn.hget(odrequest_key, 'failure_reason')).to eq('Max retry attempts exceeded')
        expect(conn.zrange('vmpooler__odcreate__task', 0, -1)).not_to include(requeued_task)
      end
    end
  end

  context 'with request_id and permanent error' do
    before(:each) do
      redis_connection_pool.with do |conn|
        stage_request_with_clone_error(conn, 'template not found', 'RuntimeError')
      end
    end

    it 'immediately marks as failed without retrying' do
      redis_connection_pool.with do |conn|
        subject.handle_timed_out_vm(vm, pool, conn)

        expect(conn.hget(odrequest_key, 'status')).to eq('failed')
        expect(conn.hget(odrequest_key, 'failure_reason')).to include('Configuration error')
        expect(conn.zrange('vmpooler__odcreate__task', 0, -1)).not_to include(requeued_task)
      end
    end
  end
end
|
||||||
|
|
||||||
|
# Specs for the permanent/transient classification of clone failures.
describe '#permanent_error?' do
  before do
    expect(subject).not_to be_nil
  end

  it 'identifies template not found errors as permanent' do
    expect(subject.permanent_error?('template not found', 'RuntimeError')).to be(true)
  end

  # Uses a class name that is not itself classified as permanent, so the
  # example passes only because of the message pattern.
  it 'identifies invalid path errors as permanent' do
    expect(subject.permanent_error?('invalid path specified', 'RuntimeError')).to be(true)
  end

  it 'identifies permission denied errors as permanent' do
    expect(subject.permanent_error?('permission denied', 'SecurityError')).to be(true)
  end

  # The message matches none of the patterns, so only the class name decides.
  it 'identifies ArgumentError class as permanent' do
    expect(subject.permanent_error?('some argument error', 'ArgumentError')).to be(true)
  end

  it 'identifies network errors as transient' do
    expect(subject.permanent_error?('connection timeout', 'Timeout::Error')).to be(false)
  end

  it 'identifies socket errors as transient' do
    expect(subject.permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false)
  end

  it 'returns false for nil inputs' do
    expect(subject.permanent_error?(nil, nil)).to be(false)
    expect(subject.permanent_error?('error', nil)).to be(false)
    expect(subject.permanent_error?(nil, 'Error')).to be(false)
  end
end
|
||||||
|
|
||||||
describe '#move_pending_vm_to_ready' do
|
describe '#move_pending_vm_to_ready' do
|
||||||
let(:host) { { 'hostname' => vm }}
|
let(:host) { { 'hostname' => vm }}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -456,6 +456,12 @@
|
||||||
# How long (in minutes) before marking a clone in 'pending' queues as 'failed' and retrying.
|
# How long (in minutes) before marking a clone in 'pending' queues as 'failed' and retrying.
|
||||||
# (default: 15)
|
# (default: 15)
|
||||||
#
|
#
|
||||||
|
# - max_vm_retries
|
||||||
|
# Maximum number of times to retry VM creation for a failed request before marking it as permanently failed.
|
||||||
|
# This helps prevent infinite retry loops when there are configuration issues like invalid template paths.
|
||||||
|
# Errors detected as permanent (like invalid template paths) are not retried at all; the request is marked as failed immediately, regardless of this limit.
|
||||||
|
# (default: 3)
|
||||||
|
#
|
||||||
# - vm_checktime
|
# - vm_checktime
|
||||||
# How often (in minutes) to check the sanity of VMs in 'ready' queues.
|
# How often (in minutes) to check the sanity of VMs in 'ready' queues.
|
||||||
# (default: 1)
|
# (default: 1)
|
||||||
|
|
@ -619,6 +625,7 @@
|
||||||
vm_checktime: 1
|
vm_checktime: 1
|
||||||
vm_lifetime: 12
|
vm_lifetime: 12
|
||||||
vm_lifetime_auth: 24
|
vm_lifetime_auth: 24
|
||||||
|
max_vm_retries: 3
|
||||||
allowed_tags:
|
allowed_tags:
|
||||||
- 'created_by'
|
- 'created_by'
|
||||||
- 'project'
|
- 'project'
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue