From f290c6806e7e1b22555b99a39628643447096285 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Thu, 4 Dec 2025 16:05:07 +0530
Subject: [PATCH 1/6] Implement request cancellation handling to prevent
 unnecessary VM spin-up

---
 Gemfile.lock                   |   1 +
 lib/vmpooler/pool_manager.rb   |  66 +++++++++++++++++--
 spec/unit/pool_manager_spec.rb | 117 +++++++++++++++++++++++++++++++++
 vmpooler.yaml.example          |   7 ++
 4 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index cfb545a..418f24d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -196,6 +196,7 @@ GEM

 PLATFORMS
   arm64-darwin-22
+  arm64-darwin-23
   universal-java-11
   universal-java-17
   x86_64-darwin-22
diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb
index ce3028b..d8aea0d 100644
--- a/lib/vmpooler/pool_manager.rb
+++ b/lib/vmpooler/pool_manager.rb
@@ -161,16 +161,70 @@ module Vmpooler
       request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id')
       pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id
       open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error')
+      clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
+      clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
       redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
+
       if request_id
         ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
         if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
-          # will retry a VM that did not come up as vm_ready? only if it has not been market failed or deleted
-          redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
+          # Check retry count and max retry limit before retrying
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+
+          # Determine if error is likely permanent (configuration issues)
+          permanent_error = is_permanent_error?(clone_error, clone_error_class)
+
+          if retry_count < max_retries && !permanent_error
+            # Increment retry count and retry VM creation
+            redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool}:1:#{request_id}")
+            $logger.log('s', "[!] [#{pool}] '#{vm}' failed, retrying (attempt #{retry_count + 1}/#{max_retries})")
+          else
+            # Max retries exceeded or permanent error, mark request as permanently failed
+            failure_reason = if permanent_error
+                               "Configuration error: #{clone_error}"
+                             else
+                               'Max retry attempts exceeded'
+                             end
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason)
+            $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}")
+            $metrics.increment("errors.permanently_failed.#{pool}")
+          end
         end
       end
       $metrics.increment("errors.markedasfailed.#{pool}")
-      open_socket_error
+      open_socket_error || clone_error
     end
+
+    # Determine if an error is likely permanent (configuration issue) vs transient
+    def is_permanent_error?(error_message, error_class)
+      return false if error_message.nil? || error_class.nil?
+
+      permanent_error_patterns = [
+        /template.*not found/i,
+        /template.*does not exist/i,
+        /invalid.*path/i,
+        /folder.*not found/i,
+        /datastore.*not found/i,
+        /resource pool.*not found/i,
+        /permission.*denied/i,
+        /authentication.*failed/i,
+        /invalid.*credentials/i,
+        /configuration.*error/i
+      ]
+
+      permanent_error_classes = [
+        'ArgumentError',
+        'NoMethodError',
+        'NameError'
+      ]
+
+      # Check error message patterns
+      permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
+      # Check error class types
+      permanent_error_classes.include?(error_class)
+    end

     def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)
@@ -489,14 +543,18 @@ module Vmpooler

       dns_plugin_class_name = get_dns_plugin_class_name_for_pool(pool_name)
       dns_plugin.create_or_replace_record(new_vmname) unless dns_plugin_class_name == 'dynamic-dns'
-    rescue StandardError
+    rescue StandardError => e
+      # Store error details for retry decision making
       @redis.with_metrics do |redis|
         redis.pipelined do |pipeline|
           pipeline.srem("vmpooler__pending__#{pool_name}", new_vmname)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error', e.message)
+          pipeline.hset("vmpooler__vm__#{new_vmname}", 'clone_error_class', e.class.name)
           expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
           pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
         end
       end
+      $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
       raise
     ensure
       @redis.with_metrics do |redis|
diff --git a/spec/unit/pool_manager_spec.rb b/spec/unit/pool_manager_spec.rb
index 3ca075e..c7b44c0 100644
--- a/spec/unit/pool_manager_spec.rb
+++ b/spec/unit/pool_manager_spec.rb
@@ -345,6 +345,123 @@ EOT
     end
   end

+  describe '#handle_timed_out_vm' do
+    before do
+      expect(subject).not_to be_nil
+    end
+
+    before(:each) do
+      redis_connection_pool.with do |redis|
+        create_pending_vm(pool, vm, redis)
+        config[:config]['max_vm_retries'] = 3
+      end
+    end
+
+    context 'without request_id' do
+      it 'moves VM to completed queue and returns error' do
+        redis_connection_pool.with do |redis|
+          redis.hset("vmpooler__vm__#{vm}", 'open_socket_error', 'connection failed')
+          result = subject.handle_timed_out_vm(vm, pool, redis)
+
+          expect(redis.sismember("vmpooler__pending__#{pool}", vm)).to be(false)
+          expect(redis.sismember("vmpooler__completed__#{pool}", vm)).to be(true)
+          expect(result).to eq('connection failed')
+        end
+      end
+    end
+
+    context 'with request_id and transient error' do
+      before(:each) do
+        redis_connection_pool.with do |redis|
+          redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id)
+          redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool)
+          redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending')
+          redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'network timeout')
+          redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'Timeout::Error')
+        end
+      end
+
+      it 'retries on first failure' do
+        redis_connection_pool.with do |redis|
+          subject.handle_timed_out_vm(vm, pool, redis)
+
+          expect(redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count')).to eq('1')
+          expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).to include("#{pool}:#{pool}:1:#{request_id}")
+        end
+      end
+
+      it 'marks as failed after max retries' do
+        redis_connection_pool.with do |redis|
+          redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', '3')
+
+          subject.handle_timed_out_vm(vm, pool, redis)
+
+          expect(redis.hget("vmpooler__odrequest__#{request_id}", 'status')).to eq('failed')
+          expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to eq('Max retry attempts exceeded')
+          expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}")
+        end
+      end
+    end
+
+    context 'with request_id and permanent error' do
+      before(:each) do
+        redis_connection_pool.with do |redis|
+          redis.hset("vmpooler__vm__#{vm}", 'request_id', request_id)
+          redis.hset("vmpooler__vm__#{vm}", 'pool_alias', pool)
+          redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'pending')
+          redis.hset("vmpooler__vm__#{vm}", 'clone_error', 'template not found')
+          redis.hset("vmpooler__vm__#{vm}", 'clone_error_class', 'RuntimeError')
+        end
+      end
+
+      it 'immediately marks as failed without retrying' do
+        redis_connection_pool.with do |redis|
+          subject.handle_timed_out_vm(vm, pool, redis)
+
+          expect(redis.hget("vmpooler__odrequest__#{request_id}", 'status')).to eq('failed')
+          expect(redis.hget("vmpooler__odrequest__#{request_id}", 'failure_reason')).to include('Configuration error')
+          expect(redis.zrange('vmpooler__odcreate__task', 0, -1)).not_to include("#{pool}:#{pool}:1:#{request_id}")
+        end
+      end
+    end
+  end
+
+  describe '#is_permanent_error?' do
+    before do
+      expect(subject).not_to be_nil
+    end
+
+    it 'identifies template not found errors as permanent' do
+      expect(subject.is_permanent_error?('template not found', 'RuntimeError')).to be(true)
+    end
+
+    it 'identifies invalid path errors as permanent' do
+      expect(subject.is_permanent_error?('invalid path specified', 'ArgumentError')).to be(true)
+    end
+
+    it 'identifies permission denied errors as permanent' do
+      expect(subject.is_permanent_error?('permission denied', 'SecurityError')).to be(true)
+    end
+
+    it 'identifies ArgumentError class as permanent' do
+      expect(subject.is_permanent_error?('some argument error', 'ArgumentError')).to be(true)
+    end
+
+    it 'identifies network errors as transient' do
+      expect(subject.is_permanent_error?('connection timeout', 'Timeout::Error')).to be(false)
+    end
+
+    it 'identifies socket errors as transient' do
+      expect(subject.is_permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false)
+    end
+
+    it 'returns false for nil inputs' do
+      expect(subject.is_permanent_error?(nil, nil)).to be(false)
+      expect(subject.is_permanent_error?('error', nil)).to be(false)
+      expect(subject.is_permanent_error?(nil, 'Error')).to be(false)
+    end
+  end
+
   describe '#move_pending_vm_to_ready' do
     let(:host) { { 'hostname' => vm }}
diff --git a/vmpooler.yaml.example b/vmpooler.yaml.example
index 818183e..f05ded2 100644
--- a/vmpooler.yaml.example
+++ b/vmpooler.yaml.example
@@ -456,6 +456,12 @@
 #   How long (in minutes) before marking a clone in 'pending' queues as 'failed' and retrying.
 #   (default: 15)
 #
+# - max_vm_retries
+#   Maximum number of times to retry VM creation for a failed request before marking it as permanently failed.
+#   This helps prevent infinite retry loops when there are configuration issues like invalid template paths.
+#   Permanent errors (like invalid template paths) are detected and will not be retried.
+#   (default: 3)
+#
 # - vm_checktime
 #   How often (in minutes) to check the sanity of VMs in 'ready' queues.
 #   (default: 1)
@@ -619,6 +625,7 @@
   vm_checktime: 1
   vm_lifetime: 12
   vm_lifetime_auth: 24
+  max_vm_retries: 3
   allowed_tags:
     - 'created_by'
     - 'project'
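Note: the retry decision this patch introduces reduces to a small pure
function, which can be easier to review in isolation. The following
self-contained Ruby sketch mirrors is_permanent_error? plus the retry
branch in handle_timed_out_vm; it is an illustration, not code from the
patch. The pattern list is trimmed to three entries for brevity, and
retry_decision is an invented helper name; the sample inputs mirror the
new specs.

  # Subset of the patterns the patch treats as permanent (configuration) errors.
  PERMANENT_ERROR_PATTERNS = [
    /template.*not found/i,
    /invalid.*path/i,
    /permission.*denied/i
  ].freeze

  # Exception class names the patch always treats as permanent.
  PERMANENT_ERROR_CLASSES = %w[ArgumentError NoMethodError NameError].freeze

  def permanent_error?(error_message, error_class)
    return false if error_message.nil? || error_class.nil?

    PERMANENT_ERROR_PATTERNS.any? { |pattern| error_message.match?(pattern) } ||
      PERMANENT_ERROR_CLASSES.include?(error_class)
  end

  # Invented helper: collapses the retry branch into one decision value.
  def retry_decision(error_message, error_class, retry_count, max_retries = 3)
    return :fail_permanent if permanent_error?(error_message, error_class)
    return :fail_max_retries if retry_count >= max_retries

    :retry
  end

  retry_decision('template not found', 'RuntimeError', 0) # => :fail_permanent
  retry_decision('network timeout', 'Timeout::Error', 1)  # => :retry
  retry_decision('network timeout', 'Timeout::Error', 3)  # => :fail_max_retries
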
From 9e75854ec442683919488c8c426c0ef9f03c1230 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Thu, 4 Dec 2025 16:12:23 +0530
Subject: [PATCH 2/6] Fixed RuboCop issues

---
 lib/vmpooler/pool_manager.rb | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb
index d8aea0d..b9bae34 100644
--- a/lib/vmpooler/pool_manager.rb
+++ b/lib/vmpooler/pool_manager.rb
@@ -164,17 +164,17 @@ module Vmpooler
       clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error')
       clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class')
       redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm)
-      
+
       if request_id
         ondemandrequest_hash = redis.hgetall("vmpooler__odrequest__#{request_id}")
         if ondemandrequest_hash && ondemandrequest_hash['status'] != 'failed' && ondemandrequest_hash['status'] != 'deleted'
           # Check retry count and max retry limit before retrying
           retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
           max_retries = $config[:config]['max_vm_retries'] || 3
-          
+
           # Determine if error is likely permanent (configuration issues)
-          permanent_error = is_permanent_error?(clone_error, clone_error_class)
-          
+          permanent_error = permanent_error?(clone_error, clone_error_class)
+
           if retry_count < max_retries && !permanent_error
             # Increment retry count and retry VM creation
             redis.hset("vmpooler__odrequest__#{request_id}", 'retry_count', retry_count + 1)
@@ -199,9 +199,9 @@ module Vmpooler
     end

     # Determine if an error is likely permanent (configuration issue) vs transient
-    def is_permanent_error?(error_message, error_class)
+    def permanent_error?(error_message, error_class)
       return false if error_message.nil? || error_class.nil?
-      
+
       permanent_error_patterns = [
         /template.*not found/i,
         /template.*does not exist/i,
@@ -214,17 +214,17 @@ module Vmpooler
         /invalid.*credentials/i,
         /configuration.*error/i
       ]
-      
+
       permanent_error_classes = [
         'ArgumentError',
         'NoMethodError',
         'NameError'
       ]
-      
+
       # Check error message patterns
       permanent_error_patterns.any? { |pattern| error_message.match?(pattern) } ||
-      # Check error class types
-      permanent_error_classes.include?(error_class)
+        # Check error class types
+        permanent_error_classes.include?(error_class)
     end

     def move_pending_vm_to_ready(vm, pool, redis, request_id = nil)

From 8372ea824f501fdbbcc4d04ad151e2831d447540 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Thu, 4 Dec 2025 16:19:34 +0530
Subject: [PATCH 3/6] Fixed spec tests

---
 spec/unit/pool_manager_spec.rb | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/spec/unit/pool_manager_spec.rb b/spec/unit/pool_manager_spec.rb
index c7b44c0..abe5555 100644
--- a/spec/unit/pool_manager_spec.rb
+++ b/spec/unit/pool_manager_spec.rb
@@ -426,39 +426,39 @@ EOT
     end
   end

-  describe '#is_permanent_error?' do
+  describe '#permanent_error?' do
     before do
       expect(subject).not_to be_nil
     end

     it 'identifies template not found errors as permanent' do
-      expect(subject.is_permanent_error?('template not found', 'RuntimeError')).to be(true)
+      expect(subject.permanent_error?('template not found', 'RuntimeError')).to be(true)
     end

     it 'identifies invalid path errors as permanent' do
-      expect(subject.is_permanent_error?('invalid path specified', 'ArgumentError')).to be(true)
+      expect(subject.permanent_error?('invalid path specified', 'ArgumentError')).to be(true)
     end

     it 'identifies permission denied errors as permanent' do
-      expect(subject.is_permanent_error?('permission denied', 'SecurityError')).to be(true)
+      expect(subject.permanent_error?('permission denied', 'SecurityError')).to be(true)
     end

     it 'identifies ArgumentError class as permanent' do
-      expect(subject.is_permanent_error?('some argument error', 'ArgumentError')).to be(true)
+      expect(subject.permanent_error?('some argument error', 'ArgumentError')).to be(true)
     end

     it 'identifies network errors as transient' do
-      expect(subject.is_permanent_error?('connection timeout', 'Timeout::Error')).to be(false)
+      expect(subject.permanent_error?('connection timeout', 'Timeout::Error')).to be(false)
     end

     it 'identifies socket errors as transient' do
-      expect(subject.is_permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false)
+      expect(subject.permanent_error?('connection refused', 'Errno::ECONNREFUSED')).to be(false)
     end

     it 'returns false for nil inputs' do
-      expect(subject.is_permanent_error?(nil, nil)).to be(false)
-      expect(subject.is_permanent_error?('error', nil)).to be(false)
-      expect(subject.is_permanent_error?(nil, 'Error')).to be(false)
+      expect(subject.permanent_error?(nil, nil)).to be(false)
+      expect(subject.permanent_error?('error', nil)).to be(false)
+      expect(subject.permanent_error?(nil, 'Error')).to be(false)
     end
   end

From 0e8c3c66e9e0d755054d8d7a3d77298ff622b263 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Thu, 18 Dec 2025 22:35:06 +0530
Subject: [PATCH 4/6] Add debug logging to retry logic for troubleshooting

---
 lib/vmpooler/pool_manager.rb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb
index b9bae34..375d9ea 100644
--- a/lib/vmpooler/pool_manager.rb
+++ b/lib/vmpooler/pool_manager.rb
@@ -172,8 +172,11 @@ module Vmpooler
           retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
           max_retries = $config[:config]['max_vm_retries'] || 3

+          $logger.log('s', "[!] [#{pool}] '#{vm}' checking retry logic: error='#{clone_error}', error_class='#{clone_error_class}', retry_count=#{retry_count}, max_retries=#{max_retries}")
+
           # Determine if error is likely permanent (configuration issues)
           permanent_error = permanent_error?(clone_error, clone_error_class)
+          $logger.log('s', "[!] [#{pool}] '#{vm}' permanent_error check result: #{permanent_error}")

           if retry_count < max_retries && !permanent_error
             # Increment retry count and retry VM creation

From 095b507a932f5a1c4b6a8346daf3aa68749977a1 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:09:03 +0530
Subject: [PATCH 5/6] Add retry logic for immediate clone failures

- Check permanent_error? and retry count when clone fails immediately
- Cancel request if permanent error or max retries exceeded
- Re-queue request for retry if transient error and retries remaining
- Log retry decisions for debugging
---
 lib/vmpooler/pool_manager.rb | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb
index 375d9ea..a136c87 100644
--- a/lib/vmpooler/pool_manager.rb
+++ b/lib/vmpooler/pool_manager.rb
@@ -556,6 +556,27 @@ module Vmpooler
           expiration_ttl = $config[:redis]['data_ttl'].to_i * 60 * 60
           pipeline.expire("vmpooler__vm__#{new_vmname}", expiration_ttl)
         end
+
+        # Handle retry logic for on-demand requests
+        if request_id
+          retry_count = (redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count') || '0').to_i
+          max_retries = $config[:config]['max_vm_retries'] || 3
+          is_permanent = permanent_error?(e.message, e.class.name)
+
+          $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' checking immediate failure retry: error='#{e.message}', error_class='#{e.class.name}', retry_count=#{retry_count}, max_retries=#{max_retries}, permanent_error=#{is_permanent}")
+
+          if is_permanent || retry_count >= max_retries
+            reason = is_permanent ? 'permanent error detected' : 'max retries exceeded'
+            $logger.log('s', "[!] [#{pool_name}] Cancelling request #{request_id} due to #{reason}")
+            redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed')
+            redis.zadd('vmpooler__odcreate__task', 0, "#{pool_alias}:#{pool_name}:0:#{request_id}")
+          else
+            # Increment retry count and re-queue for retry
+            redis.hincrby("vmpooler__odrequest__#{request_id}", 'retry_count', 1)
+            $logger.log('s', "[+] [#{pool_name}] Request #{request_id} will be retried (attempt #{retry_count + 1}/#{max_retries})")
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+          end
+        end
       end
       $logger.log('s', "[!] [#{pool_name}] '#{new_vmname}' clone failed: #{e.class}: #{e.message}")
       raise
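Note: after this patch, both failure paths (the pending-queue timeout from
PATCH 1 and the immediate clone failure handled here) read and advance the
same retry counter stored on the request's "vmpooler__odrequest__<request_id>"
hash. A minimal Ruby sketch of that bookkeeping, with a plain Hash standing
in for Redis and an invented helper name; not code from the patch:

  odrequest = { 'status' => 'pending', 'retry_count' => '0' }

  # Invented helper mirroring the immediate-failure branch above.
  def handle_clone_failure(odrequest, permanent:, max_retries: 3)
    retry_count = odrequest.fetch('retry_count', '0').to_i
    if permanent || retry_count >= max_retries
      odrequest['status'] = 'failed'                    # PATCH 6 then skips re-queueing
      :cancelled
    else
      odrequest['retry_count'] = (retry_count + 1).to_s # stands in for HINCRBY
      :requeued
    end
  end

  handle_clone_failure(odrequest, permanent: false) # => :requeued, retry_count now "1"
  handle_clone_failure(odrequest, permanent: true)  # => :cancelled, status now "failed"
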
From cd50c8ea650b2b630c75d1a56581135e62243605 Mon Sep 17 00:00:00 2001
From: Mahima Singh <105724608+smahima27@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:18:14 +0530
Subject: [PATCH 6/6] Prevent re-queueing requests already marked as failed

- Check request status before re-queueing in clone_vm rescue block
- Only re-queue if status is not 'failed'
- Prevents infinite loop when permanent errors are detected
---
 lib/vmpooler/pool_manager.rb | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb
index a136c87..fe55d74 100644
--- a/lib/vmpooler/pool_manager.rb
+++ b/lib/vmpooler/pool_manager.rb
@@ -423,7 +423,13 @@ module Vmpooler
       if request_id
         $logger.log('s', "[!] [#{pool_name}] failed while cloning VM for request #{request_id} with an error: #{e}")
         @redis.with_metrics do |redis|
-          redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+          # Only re-queue if the request wasn't already marked as failed (e.g., by permanent error detection)
+          request_status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
+          if request_status != 'failed'
+            redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
+          else
+            $logger.log('s', "[!] [#{pool_name}] Request #{request_id} already marked as failed, not re-queueing")
+          end
         end
       else
         $logger.log('s', "[!] [#{pool_name}] failed while cloning VM with an error: #{e}")
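
Note: PATCH 6's guard shown in isolation, for clarity. A request is
re-queued only while its status has not been set to 'failed' by the
permanent-error path. The sketch assumes a redis-rb style client
(hget/zadd); requeue_unless_failed is an invented helper name, not part
of the patch:

  require 'redis'

  # Returns true if the request was re-queued, false if it had already failed.
  def requeue_unless_failed(redis, request_id, pool_alias, pool_name)
    status = redis.hget("vmpooler__odrequest__#{request_id}", 'status')
    return false if status == 'failed'

    redis.zadd('vmpooler__odcreate__task', 1, "#{pool_alias}:#{pool_name}:1:#{request_id}")
    true
  end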