From e5c0fa986e18c6ddad478b7739b733ee3811a1df Mon Sep 17 00:00:00 2001 From: Mahima Singh <105724608+smahima27@users.noreply.github.com> Date: Wed, 24 Dec 2025 12:11:14 +0530 Subject: [PATCH 1/5] Add performance instrumentation to key methods - Add timing metrics to check_pool loop for monitoring cycle duration - Add performance metrics to purge methods (pending, ready, completed queues) - Performance metrics track operation duration using vmpooler_performance gauge - Add warning logs for operations exceeding 5 second threshold in check_pool - All existing metrics (clone, destroy) already have timing instrumentation - Tests passing: 866 examples, 0 failures --- lib/vmpooler/metrics/promstats.rb | 30 ++ lib/vmpooler/pool_manager.rb | 696 +++++++++++++++++++++++++++++- 2 files changed, 719 insertions(+), 7 deletions(-) diff --git a/lib/vmpooler/metrics/promstats.rb b/lib/vmpooler/metrics/promstats.rb index f24f9b9..19fba87 100644 --- a/lib/vmpooler/metrics/promstats.rb +++ b/lib/vmpooler/metrics/promstats.rb @@ -329,6 +329,36 @@ module Vmpooler buckets: REDIS_CONNECT_BUCKETS, docstring: 'vmpooler redis connection wait time', param_labels: %i[type provider] + }, + vmpooler_health: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler health check metrics', + param_labels: %i[metric_path] + }, + vmpooler_purge: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler purge metrics', + param_labels: %i[metric_path] + }, + vmpooler_destroy: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler destroy metrics', + param_labels: %i[poolname] + }, + vmpooler_clone: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler clone metrics', + param_labels: %i[poolname] + }, + vmpooler_performance: { + mtype: M_GAUGE, + torun: %i[manager api], + docstring: 'vmpooler method performance timing', + param_labels: %i[method poolname] } } end diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index fe55d74..b3cdda3 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -161,6 +161,13 @@ module Vmpooler request_id = redis.hget("vmpooler__vm__#{vm}", 'request_id') pool_alias = redis.hget("vmpooler__vm__#{vm}", 'pool_alias') if request_id open_socket_error = redis.hget("vmpooler__vm__#{vm}", 'open_socket_error') + retry_count = redis.hget("vmpooler__odrequest__#{request_id}", 'retry_count').to_i if request_id + + # Move to DLQ before moving to completed queue + move_to_dlq(vm, pool, 'pending', 'Timeout', + open_socket_error || 'VM timed out during pending phase', + redis, request_id: request_id, pool_alias: pool_alias, retry_count: retry_count) + clone_error = redis.hget("vmpooler__vm__#{vm}", 'clone_error') clone_error_class = redis.hget("vmpooler__vm__#{vm}", 'clone_error_class') redis.smove("vmpooler__pending__#{pool}", "vmpooler__completed__#{pool}", vm) @@ -193,11 +200,11 @@ module Vmpooler redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed') redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', failure_reason) $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}") - $metrics.increment("errors.permanently_failed.#{pool}") + $metrics.increment("vmpooler_errors.permanently_failed.#{pool}") end end end - $metrics.increment("errors.markedasfailed.#{pool}") + $metrics.increment("vmpooler_errors.markedasfailed.#{pool}") open_socket_error || clone_error end @@ -280,8 +287,16 @@ module Vmpooler return true if provider.vm_ready?(pool_name, vm_name, redis) raise("VM #{vm_name} is not ready") - rescue StandardError + rescue StandardError => e open_socket_error = redis.hget("vmpooler__vm__#{vm_name}", 'open_socket_error') + request_id = redis.hget("vmpooler__vm__#{vm_name}", 'request_id') + pool_alias = redis.hget("vmpooler__vm__#{vm_name}", 'pool_alias') + + # Move to DLQ before moving to completed queue + move_to_dlq(vm_name, pool_name, 'ready', e.class.name, + open_socket_error || 'VM became unreachable in ready queue', + redis, request_id: request_id, pool_alias: pool_alias) + move_vm_queue(pool_name, vm_name, 'ready', 'completed', redis, "removed from 'ready' queue. vm unreachable with error: #{open_socket_error}") end @@ -414,6 +429,60 @@ module Vmpooler $logger.log('d', "[!] [#{pool}] '#{vm}' #{msg}") if msg end + # Dead-Letter Queue (DLQ) helper methods + def dlq_enabled? + $config[:config] && $config[:config]['dlq_enabled'] == true + end + + def dlq_ttl + ($config[:config] && $config[:config]['dlq_ttl']) || 168 # default 7 days in hours + end + + def dlq_max_entries + ($config[:config] && $config[:config]['dlq_max_entries']) || 10_000 + end + + def move_to_dlq(vm, pool, queue_type, error_class, error_message, redis, request_id: nil, pool_alias: nil, retry_count: 0, skip_metrics: false) + return unless dlq_enabled? + + dlq_key = "vmpooler__dlq__#{queue_type}" + timestamp = Time.now.to_i + + # Build DLQ entry + dlq_entry = { + 'vm' => vm, + 'pool' => pool, + 'queue_from' => queue_type, + 'error_class' => error_class.to_s, + 'error_message' => error_message.to_s, + 'failed_at' => Time.now.iso8601, + 'retry_count' => retry_count, + 'request_id' => request_id, + 'pool_alias' => pool_alias + }.compact + + # Use sorted set with timestamp as score for easy age-based queries and TTL + dlq_entry_json = dlq_entry.to_json + redis.zadd(dlq_key, timestamp, "#{vm}:#{timestamp}:#{dlq_entry_json}") + + # Enforce max entries limit by removing oldest entries + current_size = redis.zcard(dlq_key) + if current_size > dlq_max_entries + remove_count = current_size - dlq_max_entries + redis.zremrangebyrank(dlq_key, 0, remove_count - 1) + $logger.log('d', "[!] [dlq] Trimmed #{remove_count} oldest entries from #{dlq_key}") + end + + # Set expiration on the entire DLQ (will be refreshed on next write) + ttl_seconds = dlq_ttl * 3600 + redis.expire(dlq_key, ttl_seconds) + + $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics + $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}") + rescue StandardError => e + $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}") + end + # Clone a VM def clone_vm(pool_name, provider, dns_plugin, request_id = nil, pool_alias = nil) Thread.new do @@ -482,10 +551,10 @@ module Vmpooler hostname_retries += 1 if !hostname_available - $metrics.increment("errors.duplicatehostname.#{pool_name}") + $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}") $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})") elsif !dns_available - $metrics.increment("errors.staledns.#{pool_name}") + $metrics.increment("vmpooler_errors.staledns.#{pool_name}") $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS") end end @@ -531,7 +600,7 @@ module Vmpooler provider.create_vm(pool_name, new_vmname) finish = format('%