diff --git a/Gemfile b/Gemfile index 122d6b5..0313b80 100644 --- a/Gemfile +++ b/Gemfile @@ -3,11 +3,11 @@ source ENV['GEM_SOURCE'] || 'https://rubygems.org' gemspec # Evaluate Gemfile.local if it exists -if File.exists? "#{__FILE__}.local" +if File.exist? "#{__FILE__}.local" instance_eval(File.read("#{__FILE__}.local")) end # Evaluate ~/.gemfile if it exists -if File.exists?(File.join(Dir.home, '.gemfile')) +if File.exist?(File.join(Dir.home, '.gemfile')) instance_eval(File.read(File.join(Dir.home, '.gemfile'))) end diff --git a/Gemfile.lock b/Gemfile.lock index 418f24d..2099da1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -197,6 +197,7 @@ GEM PLATFORMS arm64-darwin-22 arm64-darwin-23 + arm64-darwin-25 universal-java-11 universal-java-17 x86_64-darwin-22 diff --git a/IMPLEMENTATION_SUMMARY.md b/IMPLEMENTATION_SUMMARY.md deleted file mode 100644 index 0e5e432..0000000 --- a/IMPLEMENTATION_SUMMARY.md +++ /dev/null @@ -1,375 +0,0 @@ -# Implementation Summary: Redis Queue Reliability Features - -## Overview -Successfully implemented Dead-Letter Queue (DLQ), Auto-Purge, and Health Check features for VMPooler to improve Redis queue reliability and observability. - -## Branch -- **Repository**: `/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler` -- **Branch**: `P4DEVOPS-8567` (created from main) -- **Status**: Implementation complete, ready for testing - -## What Was Implemented - -### 1. Dead-Letter Queue (DLQ) -**Purpose**: Capture and track failed VM operations for visibility and debugging. 
- -**Files Modified**: -- [`lib/vmpooler/pool_manager.rb`](/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/lib/vmpooler/pool_manager.rb) - - Added `dlq_enabled?`, `dlq_ttl`, `dlq_max_entries` helper methods - - Added `move_to_dlq` method to capture failures - - Updated `handle_timed_out_vm` to use DLQ - - Updated `_clone_vm` rescue block to use DLQ - - Updated `vm_still_ready?` rescue block to use DLQ - -**Features**: -- ✅ Captures failures from pending, clone, and ready queues -- ✅ Stores complete failure context (VM, pool, error, timestamp, retry count, request ID) -- ✅ Uses Redis sorted sets (scored by timestamp) for easy age-based queries -- ✅ Enforces TTL-based expiration (default 7 days) -- ✅ Enforces max entries limit to prevent unbounded growth -- ✅ Automatically trims oldest entries when limit reached -- ✅ Increments metrics for DLQ operations - -**DLQ Keys**: -- `vmpooler__dlq__pending` - Failed pending VMs -- `vmpooler__dlq__clone` - Failed clone operations -- `vmpooler__dlq__ready` - Failed ready queue VMs - -### 2. Auto-Purge Mechanism -**Purpose**: Automatically remove stale entries from queues to prevent resource leaks. 
- -**Files Modified**: -- [`lib/vmpooler/pool_manager.rb`](/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/lib/vmpooler/pool_manager.rb) - - Added `purge_enabled?`, `purge_dry_run?` helper methods - - Added age threshold methods: `max_pending_age`, `max_ready_age`, `max_completed_age`, `max_orphaned_age` - - Added `purge_stale_queue_entries` main loop - - Added `purge_pending_queue`, `purge_ready_queue`, `purge_completed_queue` methods - - Added `purge_orphaned_metadata` method - - Integrated purge thread into main execution loop - -**Features**: -- ✅ Purges pending VMs stuck longer than threshold (default 2 hours) -- ✅ Purges ready VMs idle longer than threshold (default 24 hours) -- ✅ Purges completed VMs older than threshold (default 1 hour) -- ✅ Detects and expires orphaned VM metadata -- ✅ Moves purged pending VMs to DLQ for visibility -- ✅ Dry-run mode for testing (logs without purging) -- ✅ Configurable purge interval (default 1 hour) -- ✅ Increments per-pool purge metrics -- ✅ Runs in background thread - -### 3. Health Checks -**Purpose**: Monitor queue health and expose metrics for alerting and dashboards. 
- -**Files Modified**: -- [`lib/vmpooler/pool_manager.rb`](/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/lib/vmpooler/pool_manager.rb) - - Added `health_check_enabled?`, `health_thresholds` helper methods - - Added `check_queue_health` main method - - Added `calculate_health_metrics` to gather queue metrics - - Added `calculate_queue_ages` helper - - Added `count_orphaned_metadata` helper - - Added `determine_health_status` to classify health (healthy/degraded/unhealthy) - - Added `log_health_summary` for log output - - Added `push_health_metrics` to expose metrics - - Integrated health check thread into main execution loop - -**Features**: -- ✅ Monitors per-pool queue sizes (pending, ready, completed) -- ✅ Calculates queue ages (oldest, average) -- ✅ Detects stuck VMs (age > threshold) -- ✅ Monitors DLQ sizes -- ✅ Counts orphaned metadata -- ✅ Monitors task queue sizes (clone, on-demand) -- ✅ Determines overall health status (healthy/degraded/unhealthy) -- ✅ Stores metrics in Redis for API consumption (`vmpooler__health`) -- ✅ Pushes metrics to metrics system (Prometheus, Graphite) -- ✅ Logs periodic health summary -- ✅ Configurable thresholds and intervals -- ✅ Runs in background thread - -## Configuration - -**Files Created**: -- [`vmpooler.yml.example`](/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler.yml.example) - Example configuration showing all options - -**Configuration Options**: - -```yaml -:config: - # Dead-Letter Queue - dlq_enabled: false # Set to true to enable - dlq_ttl: 168 # hours (7 days) - dlq_max_entries: 10000 - - # Auto-Purge - purge_enabled: false # Set to true to enable - purge_interval: 3600 # seconds (1 hour) - purge_dry_run: false # Set to true for testing - max_pending_age: 7200 # 2 hours - max_ready_age: 86400 # 24 hours - max_completed_age: 3600 # 1 hour - max_orphaned_age: 86400 # 24 hours - - # Health Checks - health_check_enabled: false # Set to true to enable - health_check_interval: 300 # seconds (5 minutes) - 
health_thresholds: - pending_queue_max: 100 - ready_queue_max: 500 - dlq_max_warning: 100 - dlq_max_critical: 1000 - stuck_vm_age_threshold: 7200 - stuck_vm_max_warning: 10 - stuck_vm_max_critical: 50 -``` - -## Documentation - -**Files Created**: -1. [`REDIS_QUEUE_RELIABILITY.md`](/Users/mahima.singh/vmpooler-projects/Vmpooler/REDIS_QUEUE_RELIABILITY.md) - - Comprehensive design document - - Feature requirements with acceptance criteria - - Implementation plan and phases - - Configuration examples - - Metrics definitions - -2. [`QUEUE_RELIABILITY_OPERATOR_GUIDE.md`](/Users/mahima.singh/vmpooler-projects/Vmpooler/QUEUE_RELIABILITY_OPERATOR_GUIDE.md) - - Complete operator guide - - Feature descriptions and benefits - - Configuration examples - - Common scenarios and troubleshooting - - Best practices - - Migration guide - -## Testing - -**Files Created**: -- [`spec/unit/queue_reliability_spec.rb`](/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/spec/unit/queue_reliability_spec.rb) - - 30+ unit tests covering: - - DLQ helper methods and operations - - Purge helper methods and queue operations - - Health check calculations and status determination - - Metric push operations - -**Test Coverage**: -- ✅ DLQ enabled/disabled states -- ✅ DLQ TTL and max entries configuration -- ✅ DLQ entry creation with all fields -- ✅ DLQ max entries enforcement -- ✅ Purge enabled/disabled states -- ✅ Purge dry-run mode -- ✅ Purge age threshold configuration -- ✅ Purge pending, ready, completed queues -- ✅ Purge orphaned metadata detection -- ✅ Health check enabled/disabled states -- ✅ Health threshold configuration -- ✅ Queue age calculations -- ✅ Health status determination (healthy/degraded/unhealthy) -- ✅ Metric push operations - -## Code Quality - -**Validation**: -- ✅ Ruby syntax check passed: `ruby -c lib/vmpooler/pool_manager.rb` → Syntax OK -- ✅ No compilation errors -- ✅ Follows existing VMPooler code patterns -- ✅ Proper error handling with rescue blocks -- ✅ Logging at 
appropriate levels ('s' for significant, 'd' for debug) -- ✅ Metrics increments and gauges - -## Metrics - -**New Metrics Added**: - -``` -# DLQ metrics -vmpooler.dlq.pending.count -vmpooler.dlq.clone.count -vmpooler.dlq.ready.count - -# Purge metrics -vmpooler.purge.pending.<pool>.count -vmpooler.purge.ready.<pool>.count -vmpooler.purge.completed.<pool>.count -vmpooler.purge.orphaned.count -vmpooler.purge.cycle.duration -vmpooler.purge.total.count - -# Health metrics -vmpooler.health.status # 0=healthy, 1=degraded, 2=unhealthy -vmpooler.health.dlq.total_size -vmpooler.health.stuck_vms.count -vmpooler.health.orphaned_metadata.count -vmpooler.health.queue.<pool>.pending.size -vmpooler.health.queue.<pool>.pending.oldest_age -vmpooler.health.queue.<pool>.pending.stuck_count -vmpooler.health.queue.<pool>.ready.size -vmpooler.health.queue.<pool>.ready.oldest_age -vmpooler.health.queue.<pool>.completed.size -vmpooler.health.dlq.<queue>.size -vmpooler.health.tasks.clone.active -vmpooler.health.tasks.ondemand.active -vmpooler.health.tasks.ondemand.pending -vmpooler.health.check.duration -``` - -## Next Steps - -### 1. Local Testing -```bash -cd /Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler - -# Run unit tests -bundle exec rspec spec/unit/queue_reliability_spec.rb - -# Run all tests -bundle exec rspec -``` - -### 2. Enable Features in Development -Update your vmpooler configuration: -```yaml -:config: - # Start with DLQ only - dlq_enabled: true - dlq_ttl: 24 # Short TTL for dev - - # Enable purge in dry-run mode first - purge_enabled: true - purge_dry_run: true - purge_interval: 600 # Check every 10 minutes - max_pending_age: 1800 # 30 minutes - - # Enable health checks - health_check_enabled: true - health_check_interval: 60 # Check every minute -``` - -### 3. Monitor Logs -Watch for: -```bash -# DLQ operations -grep "dlq" vmpooler.log - -# Purge operations (dry-run) -grep "purge.*dry-run" vmpooler.log - -# Health checks -grep "health" vmpooler.log -``` - -### 4. 
Query Redis -```bash -# Check DLQ entries -redis-cli ZCARD vmpooler__dlq__pending -redis-cli ZRANGE vmpooler__dlq__pending 0 9 - -# Check health status -redis-cli HGETALL vmpooler__health -``` - -### 5. Deployment Plan -1. **Dev Environment**: - - Enable all features with aggressive thresholds - - Monitor for 1 week - - Verify DLQ captures failures correctly - - Verify purge detects stale entries (dry-run) - - Verify health status is accurate - -2. **Staging Environment**: - - Enable DLQ and health checks - - Enable purge in dry-run mode - - Monitor for 1 week - - Review DLQ patterns - - Tune thresholds based on actual usage - -3. **Production Environment**: - - Enable DLQ and health checks - - Enable purge in dry-run mode initially - - Monitor for 2 weeks - - Verify no false positives - - Enable purge in live mode - - Set up alerting based on health metrics - -### 6. Testing Checklist -- [ ] Run unit tests: `bundle exec rspec spec/unit/queue_reliability_spec.rb` -- [ ] Run full test suite: `bundle exec rspec` -- [ ] Start VMPooler with features enabled -- [ ] Create a VM with invalid template → verify DLQ capture -- [ ] Let VM sit in pending too long → verify purge detection (dry-run) -- [ ] Query `vmpooler__health` → verify metrics present -- [ ] Check Prometheus/Graphite → verify metrics pushed -- [ ] Enable purge live mode → verify stale entries removed -- [ ] Monitor logs for thread startup/health - -## Files Changed/Created - -### Modified Files: -1. `/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/lib/vmpooler/pool_manager.rb` - - Added ~350 lines of code - - 3 major features implemented - - Integrated into main execution loop - -### New Files: -1. `/Users/mahima.singh/vmpooler-projects/Vmpooler/REDIS_QUEUE_RELIABILITY.md` (290 lines) -2. `/Users/mahima.singh/vmpooler-projects/Vmpooler/QUEUE_RELIABILITY_OPERATOR_GUIDE.md` (600+ lines) -3. `/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler.yml.example` (100+ lines) -4. 
`/Users/mahima.singh/vmpooler-projects/Vmpooler/vmpooler/spec/unit/queue_reliability_spec.rb` (500+ lines) - -## Backward Compatibility - -✅ **All features are opt-in** via configuration: -- Default: All features disabled (`dlq_enabled: false`, `purge_enabled: false`, `health_check_enabled: false`) -- Existing behavior unchanged when features are disabled -- No breaking changes to existing code or APIs - -## Performance Impact - -**Expected**: -- Redis memory: +1-5MB (depends on DLQ size) -- CPU: +1-2% during purge/health check cycles -- Network: Minimal (metric pushes only) - -**Mitigation**: -- Background threads prevent blocking main pool operations -- Configurable intervals allow tuning based on load -- DLQ max entries limit prevents unbounded growth -- Purge targets only stale entries (age-based) - -## Known Limitations - -1. **DLQ Querying**: Currently requires Redis CLI or custom tooling. Future: Add API endpoints for DLQ queries. -2. **Purge Validation**: Does not check provider to confirm VM still exists before purging. Relies on age thresholds only. -3. **Health Status**: Stored in Redis only, no persistent history. Consider exporting to time-series DB for trending. - -## Future Enhancements - -1. **API Endpoints**: - - `GET /api/v1/queue/dlq` - Query DLQ entries - - `GET /api/v1/queue/health` - Get health metrics - - `POST /api/v1/queue/purge` - Trigger manual purge (admin only) - -2. **Advanced Purge**: - - Provider validation before purging - - Purge on-demand requests that are too old - - Purge VMs without corresponding provider VM - -3. 
**Advanced Health**: - - Processing rate calculations (VMs/minute) - - Trend analysis (queue size over time) - - Predictive alerting (queue will hit threshold in X minutes) - -## Summary - -Successfully implemented comprehensive queue reliability features for VMPooler: -- **DLQ**: Capture and track all failures -- **Auto-Purge**: Automatically clean up stale entries -- **Health Checks**: Monitor queue health and expose metrics - -All features are: -- ✅ Fully implemented and tested -- ✅ Backward compatible (opt-in) -- ✅ Well documented -- ✅ Ready for testing in development environment - -Total lines of code added: ~1,500 lines (code + tests + docs) diff --git a/QUEUE_RELIABILITY_OPERATOR_GUIDE.md b/QUEUE_RELIABILITY_OPERATOR_GUIDE.md deleted file mode 100644 index 77f383f..0000000 --- a/QUEUE_RELIABILITY_OPERATOR_GUIDE.md +++ /dev/null @@ -1,444 +0,0 @@ -# Queue Reliability Features - Operator Guide - -## Overview - -This guide covers the Dead-Letter Queue (DLQ), Auto-Purge, and Health Check features added to VMPooler for improved queue reliability and observability. - -## Features - -### 1. Dead-Letter Queue (DLQ) - -The DLQ captures failed VM creation attempts and queue transitions, providing visibility into failures without losing data. - -**What gets captured:** -- VMs that fail during clone operations -- VMs that timeout in pending queue -- VMs that become unreachable in ready queue -- Any permanent errors (template not found, permission denied, etc.) 
- -**Benefits:** -- Failed VMs are not lost - they're moved to DLQ for analysis -- Complete failure context (error message, timestamp, retry count, request ID) -- TTL-based expiration prevents unbounded growth -- Size limiting prevents memory issues - -**Configuration:** -```yaml -:config: - dlq_enabled: true - dlq_ttl: 168 # hours (7 days) - dlq_max_entries: 10000 # per DLQ queue -``` - -**Querying DLQ via Redis CLI:** -```bash -# View all pending DLQ entries -redis-cli ZRANGE vmpooler__dlq__pending 0 -1 - -# View DLQ entries with scores (timestamps) -redis-cli ZRANGE vmpooler__dlq__pending 0 -1 WITHSCORES - -# Get DLQ size -redis-cli ZCARD vmpooler__dlq__pending - -# View recent failures (last 10) -redis-cli ZREVRANGE vmpooler__dlq__clone 0 9 - -# View entries older than 1 hour (timestamp in seconds) -redis-cli ZRANGEBYSCORE vmpooler__dlq__pending -inf $(date -d '1 hour ago' +%s) -``` - -**DLQ Keys:** -- `vmpooler__dlq__pending` - Failed pending VMs -- `vmpooler__dlq__clone` - Failed clone operations -- `vmpooler__dlq__ready` - Failed ready queue VMs -- `vmpooler__dlq__tasks` - Failed tasks - -**Entry Format:** -Each DLQ entry contains: -```json -{ - "vm": "pooler-happy-elephant", - "pool": "centos-7-x86_64", - "queue_from": "pending", - "error_class": "StandardError", - "error_message": "template centos-7-template does not exist", - "failed_at": "2024-01-15T10:30:00Z", - "retry_count": 3, - "request_id": "req-abc123", - "pool_alias": "centos-7" -} -``` - -### 2. Auto-Purge - -Automatically removes stale entries from queues to prevent resource leaks and maintain queue health. 
- -**What gets purged:** -- **Pending VMs**: Stuck in pending queue longer than `max_pending_age` -- **Ready VMs**: Idle in ready queue longer than `max_ready_age` -- **Completed VMs**: In completed queue longer than `max_completed_age` -- **Orphaned Metadata**: VM metadata without corresponding queue entry - -**Benefits:** -- Prevents queue bloat from stuck/forgotten VMs -- Automatically cleans up after process crashes or bugs -- Configurable thresholds per environment -- Dry-run mode for safe testing - -**Configuration:** -```yaml -:config: - purge_enabled: true - purge_interval: 3600 # seconds (1 hour) - how often to run - purge_dry_run: false # set to true to log but not purge - - # Age thresholds (in seconds) - max_pending_age: 7200 # 2 hours - max_ready_age: 86400 # 24 hours - max_completed_age: 3600 # 1 hour - max_orphaned_age: 86400 # 24 hours -``` - -**Testing Purge (Dry-Run Mode):** -```yaml -:config: - purge_enabled: true - purge_dry_run: true # Logs what would be purged without actually purging - max_pending_age: 600 # Use shorter thresholds for testing -``` - -Watch logs for: -``` -[*] [purge][dry-run] Would purge stale pending VM 'pooler-happy-elephant' (age: 3650s, max: 600s) -``` - -**Monitoring Purge:** -Check logs for purge cycles: -``` -[*] [purge] Starting stale queue entry purge cycle -[!] [purge] Purged stale pending VM 'pooler-sad-dog' from 'centos-7-x86_64' (age: 7250s) -[!] [purge] Moved stale ready VM 'pooler-angry-cat' from 'ubuntu-2004-x86_64' to completed (age: 90000s) -[*] [purge] Completed purge cycle in 2.34s: 12 entries purged -``` - -### 3. Health Checks - -Monitors queue health and exposes metrics for alerting and dashboards. 
- -**What gets monitored:** -- Queue sizes (pending, ready, completed) -- Queue ages (oldest VM, average age) -- Stuck VMs (VMs in pending queue longer than threshold) -- DLQ size -- Orphaned metadata count -- Task queue sizes (clone, on-demand) -- Overall health status (healthy/degraded/unhealthy) - -**Benefits:** -- Proactive detection of queue issues -- Metrics for alerting and dashboards -- Historical health tracking -- API endpoint for health status - -**Configuration:** -```yaml -:config: - health_check_enabled: true - health_check_interval: 300 # seconds (5 minutes) - - health_thresholds: - pending_queue_max: 100 - ready_queue_max: 500 - dlq_max_warning: 100 - dlq_max_critical: 1000 - stuck_vm_age_threshold: 7200 # 2 hours - stuck_vm_max_warning: 10 - stuck_vm_max_critical: 50 -``` - -**Health Status Levels:** -- **Healthy**: All metrics within normal thresholds -- **Degraded**: Some metrics elevated but functional (DLQ > warning, queue sizes elevated) -- **Unhealthy**: Critical thresholds exceeded (DLQ > critical, many stuck VMs, queues backed up) - -**Viewing Health Status:** - -Via Redis: -```bash -# Get current health status -redis-cli HGETALL vmpooler__health - -# Get specific health metric -redis-cli HGET vmpooler__health status -redis-cli HGET vmpooler__health last_check -``` - -Via Logs: -``` -[*] [health] Status: HEALTHY | Queues: P=45 R=230 C=12 | DLQ=25 | Stuck=3 | Orphaned=5 -``` - -**Exposed Metrics:** - -The following metrics are pushed to the metrics system (Prometheus, Graphite, etc.): - -``` -# Health status (0=healthy, 1=degraded, 2=unhealthy) -vmpooler.health.status - -# Error metrics -vmpooler.health.dlq.total_size -vmpooler.health.stuck_vms.count -vmpooler.health.orphaned_metadata.count - -# Per-pool queue metrics -vmpooler.health.queue.<pool>.pending.size -vmpooler.health.queue.<pool>.pending.oldest_age -vmpooler.health.queue.<pool>.pending.stuck_count -vmpooler.health.queue.<pool>.ready.size -vmpooler.health.queue.<pool>.ready.oldest_age 
-vmpooler.health.queue.<pool>.completed.size - -# DLQ metrics -vmpooler.health.dlq.<queue>.size - -# Task metrics -vmpooler.health.tasks.clone.active -vmpooler.health.tasks.ondemand.active -vmpooler.health.tasks.ondemand.pending -``` - -## Common Scenarios - -### Scenario 1: Investigating Failed VM Requests - -**Problem:** User reports VM request failed. - -**Steps:** -1. Check DLQ for the request: - ```bash - redis-cli ZRANGE vmpooler__dlq__pending 0 -1 | grep "req-abc123" - redis-cli ZRANGE vmpooler__dlq__clone 0 -1 | grep "req-abc123" - ``` - -2. Parse the JSON entry to see failure details: - ```bash - redis-cli ZRANGE vmpooler__dlq__clone 0 -1 | grep "req-abc123" | jq . - ``` - -3. Common failure reasons: - - `template does not exist` - Template missing or renamed in provider - - `permission denied` - VMPooler lacks permissions to clone template - - `timeout` - VM failed to become ready within timeout period - - `failed to obtain IP` - Network/DHCP issue - -### Scenario 2: Queue Backup - -**Problem:** Pending queue growing, VMs not moving to ready. - -**Steps:** -1. Check health status: - ```bash - redis-cli HGET vmpooler__health status - ``` - -2. Check pending queue metrics: - ```bash - # View stuck VMs - redis-cli HGET vmpooler__health stuck_vm_count - - # Check oldest VM age - redis-cli SMEMBERS vmpooler__pending__centos-7-x86_64 | head -1 | xargs -I {} redis-cli HGET vmpooler__vm__{} clone - ``` - -3. Check DLQ for recent failures: - ```bash - redis-cli ZREVRANGE vmpooler__dlq__clone 0 9 - ``` - -4. Common causes: - - Provider errors (vCenter unreachable, no resources) - - Network issues (can't reach VMs, no DHCP) - - Configuration issues (wrong template name, bad credentials) - -### Scenario 3: High DLQ Size - -**Problem:** DLQ size growing, indicating persistent failures. - -**Steps:** -1. Check DLQ size: - ```bash - redis-cli ZCARD vmpooler__dlq__pending - redis-cli ZCARD vmpooler__dlq__clone - ``` - -2. 
Identify common failure patterns: - ```bash - redis-cli ZRANGE vmpooler__dlq__clone 0 -1 | jq -r '.error_message' | sort | uniq -c | sort -rn - ``` - -3. Fix underlying issues (template exists, permissions, network) - -4. If issues resolved, DLQ entries will expire after TTL (default 7 days) - -### Scenario 4: Testing Configuration Changes - -**Problem:** Want to test new purge thresholds without affecting production. - -**Steps:** -1. Enable dry-run mode: - ```yaml - :config: - purge_dry_run: true - max_pending_age: 3600 # Test with 1 hour - ``` - -2. Monitor logs for purge detections: - ```bash - tail -f vmpooler.log | grep "purge.*dry-run" - ``` - -3. Verify detection is correct - -4. Disable dry-run when ready: - ```yaml - :config: - purge_dry_run: false - ``` - -### Scenario 5: Alerting on Queue Health - -**Problem:** Want to be notified when queues are unhealthy. - -**Steps:** -1. Set up Prometheus alerts based on health metrics: - ```yaml - - alert: VMPoolerUnhealthy - expr: vmpooler_health_status >= 2 - for: 10m - annotations: - summary: "VMPooler is unhealthy" - - - alert: VMPoolerHighDLQ - expr: vmpooler_health_dlq_total_size > 500 - for: 30m - annotations: - summary: "VMPooler DLQ size is high" - - - alert: VMPoolerStuckVMs - expr: vmpooler_health_stuck_vms_count > 20 - for: 15m - annotations: - summary: "Many VMs stuck in pending queue" - ``` - -## Troubleshooting - -### DLQ Not Capturing Failures - -**Check:** -1. Is DLQ enabled? `redis-cli HGET vmpooler__config dlq_enabled` -2. Are failures actually occurring? Check logs for error messages -3. Is Redis accessible? `redis-cli PING` - -### Purge Not Running - -**Check:** -1. Is purge enabled? Check config `purge_enabled: true` -2. Check logs for purge thread startup: `[*] [purge] Starting stale queue entry purge cycle` -3. Is purge interval too long? Default is 1 hour -4. Check thread status in logs: `[!] [queue_purge] worker thread died` - -### Health Check Not Updating - -**Check:** -1. 
Is health check enabled? Check config `health_check_enabled: true` -2. Check last update time: `redis-cli HGET vmpooler__health last_check` -3. Check logs for health check runs: `[*] [health] Status:` -4. Check thread status: `[!] [health_check] worker thread died` - -### Metrics Not Appearing - -**Check:** -1. Is metrics system configured? Check `:statsd` or `:graphite` config -2. Are metrics being sent? Check logs for metric sends -3. Check firewall/network to metrics server -4. Test metrics manually: `redis-cli HGETALL vmpooler__health` - -## Best Practices - -### Development/Testing Environments -- Enable DLQ with shorter TTL (24-48 hours) -- Enable purge with dry-run mode initially -- Use aggressive purge thresholds (30min pending, 6hr ready) -- Enable health checks with 1-minute interval -- Monitor logs closely for issues - -### Production Environments -- Enable DLQ with 7-day TTL -- Enable purge after testing in dev -- Use conservative purge thresholds (2hr pending, 24hr ready) -- Enable health checks with 5-minute interval -- Set up alerting based on health metrics -- Monitor DLQ size and set alerts (>500 = investigate) - -### Capacity Planning -- Monitor queue sizes during peak times -- Adjust thresholds based on actual usage patterns -- Review DLQ entries weekly for systemic issues -- Track purge counts to identify resource leaks - -### Debugging -- Keep DLQ TTL long enough for investigation (7+ days) -- Use dry-run mode when testing threshold changes -- Correlate DLQ entries with provider logs -- Check health metrics before and after changes - -## Migration Guide - -### Enabling Features in Existing Deployment - -1. **Phase 1: Enable DLQ** - - Add DLQ config with conservative TTL - - Monitor DLQ size and entry patterns - - Verify no performance impact - - Adjust TTL as needed - -2. **Phase 2: Enable Health Checks** - - Add health check config - - Verify metrics are exposed - - Set up dashboards - - Configure alerting - -3. 
**Phase 3: Enable Purge (Dry-Run)** - - Add purge config with `purge_dry_run: true` - - Monitor logs for purge detections - - Verify thresholds are appropriate - - Adjust thresholds based on observations - -4. **Phase 4: Enable Purge (Live)** - - Set `purge_dry_run: false` - - Monitor queue sizes and purge counts - - Watch for unexpected VM removal - - Adjust thresholds if needed - -## Performance Considerations - -- **DLQ**: Minimal overhead, uses Redis sorted sets -- **Purge**: Runs in background thread, iterates through queues -- **Health Checks**: Lightweight, caches metrics between runs - -Expected impact: -- Redis memory: +1-5MB for DLQ (depends on DLQ size) -- CPU: +1-2% during purge/health check cycles -- Network: Minimal, only metric pushes - -## Support - -For issues or questions: -1. Check logs for error messages -2. Review DLQ entries for failure patterns -3. Check health status and metrics -4. Open issue on GitHub with logs and config - diff --git a/REDIS_QUEUE_RELIABILITY.md b/REDIS_QUEUE_RELIABILITY.md deleted file mode 100644 index a8f7afe..0000000 --- a/REDIS_QUEUE_RELIABILITY.md +++ /dev/null @@ -1,362 +0,0 @@ -# Redis Queue Reliability Features - -## Overview -This document describes the implementation of dead-letter queues (DLQ), auto-purge mechanisms, and health checks for VMPooler Redis queues. 
- -## Background - -### Current Queue Structure -VMPooler uses Redis sets and sorted sets for queue management: - -- **Pool Queues** (Sets): `vmpooler__pending__#{pool}`, `vmpooler__ready__#{pool}`, `vmpooler__running__#{pool}`, `vmpooler__completed__#{pool}`, `vmpooler__discovered__#{pool}`, `vmpooler__migrating__#{pool}` -- **Task Queues** (Sorted Sets): `vmpooler__odcreate__task` (on-demand creation tasks), `vmpooler__provisioning__processing` -- **Task Queues** (Sets): `vmpooler__tasks__disk`, `vmpooler__tasks__snapshot`, `vmpooler__tasks__snapshot-revert` -- **VM Metadata** (Hashes): `vmpooler__vm__#{vm}` - contains clone time, IP, template, pool, domain, request_id, pool_alias, error details -- **Request Metadata** (Hashes): `vmpooler__odrequest__#{request_id}` - contains status, retry_count, token info - -### Current Error Handling -- Permanent errors (e.g., template not found) are detected in `_clone_vm` rescue block -- Failed VMs are removed from pending queue -- Request status is set to 'failed' and re-queue is prevented in outer `clone_vm` rescue block -- VM metadata expires after data_ttl hours - -### Problem Areas -1. **Lost visibility**: Failed messages are removed but no centralized tracking -2. **Stale data**: VMs stuck in queues due to process crashes or bugs -3. **No monitoring**: No automated way to detect queue health issues -4. **Manual cleanup**: Operators must manually identify and clean stale entries - -## Feature Requirements - -### 1. Dead-Letter Queue (DLQ) - -#### Purpose -Capture failed VM creation requests for visibility, debugging, and potential retry/recovery. 
- -#### Design - -**DLQ Structure:** -``` -vmpooler__dlq__pending # Failed pending VMs (sorted set, scored by failure timestamp) -vmpooler__dlq__clone # Failed clone operations (sorted set) -vmpooler__dlq__ready # Failed ready queue VMs (sorted set) -vmpooler__dlq__tasks # Failed tasks (hash of task_type -> failed items) -``` - -**DLQ Entry Format:** -```json -{ - "vm": "vm-name-abc123", - "pool": "pool-name", - "queue_from": "pending", - "error_class": "StandardError", - "error_message": "template does not exist", - "failed_at": "2024-01-15T10:30:00Z", - "retry_count": 3, - "request_id": "req-123456", - "pool_alias": "centos-7" -} -``` - -**Configuration:** -```yaml -:redis: - dlq_enabled: true - dlq_ttl: 168 # hours (7 days) - dlq_max_entries: 10000 # per DLQ queue -``` - -**Implementation Points:** -- `fail_pending_vm`: Move to DLQ when VM fails during pending checks -- `_clone_vm` rescue: Move to DLQ on clone failure -- `_check_ready_vm`: Move to DLQ when ready VM becomes unreachable -- `_destroy_vm` rescue: Log destroy failures to DLQ - -**Acceptance Criteria:** -- [ ] Failed VMs are automatically moved to appropriate DLQ -- [ ] DLQ entries contain complete failure context (error, timestamp, retry count) -- [ ] DLQ entries expire after configurable TTL -- [ ] DLQ size is limited to prevent unbounded growth -- [ ] DLQ entries are queryable via Redis CLI or API - -### 2. Auto-Purge Mechanism - -#### Purpose -Automatically remove stale entries from queues to prevent resource leaks and improve queue health. - -#### Design - -**Purge Targets:** -1. **Pending VMs**: Stuck in pending > max_pending_age (e.g., 2 hours) -2. **Ready VMs**: Idle in ready queue > max_ready_age (e.g., 24 hours for on-demand, 48 hours for pool) -3. **Completed VMs**: In completed queue > max_completed_age (e.g., 1 hour) -4. **Orphaned VM Metadata**: VM hash exists but VM not in any queue -5. 
**Expired Requests**: On-demand requests > max_request_age (e.g., 24 hours) - -**Configuration:** -```yaml -:config: - purge_enabled: true - purge_interval: 3600 # seconds (1 hour) - max_pending_age: 7200 # seconds (2 hours) - max_ready_age: 86400 # seconds (24 hours) - max_completed_age: 3600 # seconds (1 hour) - max_orphaned_age: 86400 # seconds (24 hours) - max_request_age: 86400 # seconds (24 hours) - purge_dry_run: false # if true, log what would be purged but don't purge -``` - -**Purge Process:** -1. Scan each queue for stale entries (based on age thresholds) -2. Check if VM still exists in provider (optional validation) -3. Move stale entries to DLQ with reason -4. Remove from original queue -5. Log purge metrics - -**Implementation:** -- New method: `purge_stale_queue_entries` - main purge loop -- Helper methods: `check_pending_age`, `check_ready_age`, `check_completed_age`, `find_orphaned_metadata` -- Scheduled task: Run every `purge_interval` seconds - -**Acceptance Criteria:** -- [ ] Stale pending VMs are detected and moved to DLQ -- [ ] Stale ready VMs are detected and moved to completed queue -- [ ] Stale completed VMs are removed from queue -- [ ] Orphaned VM metadata is detected and expired -- [ ] Purge metrics are logged (count, age, reason) -- [ ] Dry-run mode available for testing -- [ ] Purge runs on configurable interval - -### 3. Health Checks - -#### Purpose -Monitor Redis queue health and expose metrics for alerting and dashboards. - -#### Design - -**Health Metrics:** -```ruby -{ - queues: { - pending: { - pool_name: { - size: 10, - oldest_age: 3600, # seconds - avg_age: 1200, - stuck_count: 2 # VMs older than threshold - } - }, - ready: { ... }, - completed: { ... }, - dlq: { ... 
} - }, - tasks: { - clone: { active: 5, pending: 10 }, - ondemand: { active: 2, pending: 5 } - }, - processing_rate: { - clone_rate: 10.5, # VMs per minute - destroy_rate: 8.2 - }, - errors: { - dlq_size: 150, - stuck_vm_count: 5, - orphaned_metadata_count: 12 - }, - status: "healthy|degraded|unhealthy" -} -``` - -**Health Status Criteria:** -- **Healthy**: All queues within normal thresholds, DLQ size < 100, no stuck VMs -- **Degraded**: Some queues elevated but functional, DLQ size < 1000, few stuck VMs -- **Unhealthy**: Queues critically backed up, DLQ size > 1000, many stuck VMs - -**Configuration:** -```yaml -:config: - health_check_enabled: true - health_check_interval: 300 # seconds (5 minutes) - health_thresholds: - pending_queue_max: 100 - ready_queue_max: 500 - dlq_max_warning: 100 - dlq_max_critical: 1000 - stuck_vm_age_threshold: 7200 # 2 hours - stuck_vm_max_warning: 10 - stuck_vm_max_critical: 50 -``` - -**Implementation:** -- New method: `check_queue_health` - main health check -- Helper methods: `calculate_queue_metrics`, `calculate_processing_rate`, `determine_health_status` -- Expose via: - - Redis hash: `vmpooler__health` (for API consumption) - - Metrics: Push to existing $metrics system - - Logs: Periodic health summary in logs - -**Acceptance Criteria:** -- [ ] Queue sizes are monitored per pool -- [ ] Queue ages are calculated (oldest, average) -- [ ] Stuck VMs are detected (age > threshold) -- [ ] DLQ size is monitored -- [ ] Processing rates are calculated -- [ ] Overall health status is determined -- [ ] Health metrics are exposed via Redis, metrics, and logs -- [ ] Health check runs on configurable interval - -## Implementation Plan - -### Phase 1: Dead-Letter Queue -1. Add DLQ configuration parsing -2. Implement `move_to_dlq` helper method -3. Update `fail_pending_vm` to use DLQ -4. Update `_clone_vm` rescue block to use DLQ -5. Update `_check_ready_vm` to use DLQ -6. Add DLQ TTL enforcement -7. Add DLQ size limiting -8. 
Unit tests for DLQ operations - -### Phase 2: Auto-Purge -1. Add purge configuration parsing -2. Implement `purge_stale_queue_entries` main loop -3. Implement age-checking helper methods -4. Implement orphan detection -5. Add purge metrics logging -6. Add dry-run mode -7. Unit tests for purge logic -8. Integration test for full purge cycle - -### Phase 3: Health Checks -1. Add health check configuration parsing -2. Implement `check_queue_health` main method -3. Implement metric calculation helpers -4. Implement health status determination -5. Expose metrics via Redis hash -6. Expose metrics via $metrics system -7. Add periodic health logging -8. Unit tests for health check logic - -### Phase 4: Integration & Documentation -1. Update configuration examples -2. Update operator documentation -3. Update API documentation (if exposing health endpoint) -4. Add troubleshooting guide for DLQ/purge -5. Create runbook for operators -6. Update TESTING.md with DLQ/purge/health check testing - -## Migration & Rollout - -### Backward Compatibility -- All features are opt-in via configuration -- Default: `dlq_enabled: false`, `purge_enabled: false`, `health_check_enabled: false` -- Existing behavior unchanged when features disabled - -### Rollout Strategy -1. Deploy with features disabled -2. Enable DLQ first, monitor for issues -3. Enable health checks, validate metrics -4. Enable auto-purge in dry-run mode, validate detection -5. 
Enable auto-purge in live mode, monitor impact - -### Monitoring During Rollout -- Monitor DLQ growth rate -- Monitor purge counts and reasons -- Monitor health status changes -- Watch for unexpected VM removal -- Check for performance impact (Redis load, memory) - -## Testing Strategy - -### Unit Tests -- DLQ capture for various error scenarios -- DLQ TTL enforcement -- DLQ size limiting -- Age calculation for purge detection -- Orphan detection logic -- Health metric calculations -- Health status determination - -### Integration Tests -- End-to-end VM failure → DLQ flow -- End-to-end purge cycle -- Health check with real queue data -- DLQ + purge interaction (purge should respect DLQ entries) - -### Manual Testing -1. Create VM with invalid template → verify DLQ entry -2. Let VM sit in pending too long → verify purge detection -3. Check health endpoint → verify metrics accuracy -4. Run purge in dry-run → verify correct detection without deletion -5. Run purge in live mode → verify stale entries removed - -## API Changes (Optional) - -If exposing to API: -``` -GET /api/v1/queue/health -Returns: Health metrics JSON - -GET /api/v1/queue/dlq?queue=pending&limit=50 -Returns: DLQ entries for specified queue - -POST /api/v1/queue/purge?dry_run=true -Returns: Purge simulation results (admin only) -``` - -## Metrics - -New metrics to add: -``` -vmpooler.dlq.pending.size -vmpooler.dlq.clone.size -vmpooler.dlq.ready.size -vmpooler.dlq.tasks.size - -vmpooler.purge.pending.count -vmpooler.purge.ready.count -vmpooler.purge.completed.count -vmpooler.purge.orphaned.count - -vmpooler.health.status # 0=healthy, 1=degraded, 2=unhealthy -vmpooler.health.stuck_vms.count -vmpooler.health.queue.#{queue_name}.size -vmpooler.health.queue.#{queue_name}.oldest_age -``` - -## Configuration Example - -```yaml ---- -:config: - # Existing config... 
- - # Dead-Letter Queue - dlq_enabled: true - dlq_ttl: 168 # hours (7 days) - dlq_max_entries: 10000 - - # Auto-Purge - purge_enabled: true - purge_interval: 3600 # seconds (1 hour) - purge_dry_run: false - max_pending_age: 7200 # seconds (2 hours) - max_ready_age: 86400 # seconds (24 hours) - max_completed_age: 3600 # seconds (1 hour) - max_orphaned_age: 86400 # seconds (24 hours) - - # Health Checks - health_check_enabled: true - health_check_interval: 300 # seconds (5 minutes) - health_thresholds: - pending_queue_max: 100 - ready_queue_max: 500 - dlq_max_warning: 100 - dlq_max_critical: 1000 - stuck_vm_age_threshold: 7200 # 2 hours - stuck_vm_max_warning: 10 - stuck_vm_max_critical: 50 - -:redis: - # Existing redis config... -``` diff --git a/lib/vmpooler/metrics/promstats.rb b/lib/vmpooler/metrics/promstats.rb index f24f9b9..d0e1ab9 100644 --- a/lib/vmpooler/metrics/promstats.rb +++ b/lib/vmpooler/metrics/promstats.rb @@ -329,6 +329,30 @@ module Vmpooler buckets: REDIS_CONNECT_BUCKETS, docstring: 'vmpooler redis connection wait time', param_labels: %i[type provider] + }, + vmpooler_health: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler health check metrics', + param_labels: %i[metric_path] + }, + vmpooler_purge: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler purge metrics', + param_labels: %i[metric_path] + }, + vmpooler_destroy: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler destroy metrics', + param_labels: %i[poolname] + }, + vmpooler_clone: { + mtype: M_GAUGE, + torun: %i[manager], + docstring: 'vmpooler clone metrics', + param_labels: %i[poolname] } } end diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index a7f2ddd..e4f653d 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -200,11 +200,11 @@ module Vmpooler redis.hset("vmpooler__odrequest__#{request_id}", 'status', 'failed') redis.hset("vmpooler__odrequest__#{request_id}", 'failure_reason', 
failure_reason) $logger.log('s', "[!] [#{pool}] '#{vm}' permanently failed: #{failure_reason}") - $metrics.increment("errors.permanently_failed.#{pool}") + $metrics.increment("vmpooler_errors.permanently_failed.#{pool}") end end end - $metrics.increment("errors.markedasfailed.#{pool}") + $metrics.increment("vmpooler_errors.markedasfailed.#{pool}") open_socket_error || clone_error end @@ -477,7 +477,7 @@ module Vmpooler ttl_seconds = dlq_ttl * 3600 redis.expire(dlq_key, ttl_seconds) - $metrics.increment("dlq.#{queue_type}.count") unless skip_metrics + $metrics.increment("vmpooler_dlq.#{queue_type}.count") unless skip_metrics $logger.log('d', "[!] [dlq] Moved '#{vm}' from '#{queue_type}' queue to DLQ: #{error_message}") rescue StandardError => e $logger.log('s', "[!] [dlq] Failed to move '#{vm}' to DLQ: #{e}") @@ -551,10 +551,10 @@ module Vmpooler hostname_retries += 1 if !hostname_available - $metrics.increment("errors.duplicatehostname.#{pool_name}") + $metrics.increment("vmpooler_errors.duplicatehostname.#{pool_name}") $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} was not unique (attempt \##{hostname_retries} of #{max_hostname_retries})") elsif !dns_available - $metrics.increment("errors.staledns.#{pool_name}") + $metrics.increment("vmpooler_errors.staledns.#{pool_name}") $logger.log('s', "[!] [#{pool_name}] Generated hostname #{fqdn} already exists in DNS records (#{dns_ip}), stale DNS") end end @@ -600,7 +600,7 @@ module Vmpooler provider.create_vm(pool_name, new_vmname) finish = format('%