From ada79e81f4406ac44f8deb3a52ffff113268c62a Mon Sep 17 00:00:00 2001 From: "kirby@puppetlabs.com" Date: Fri, 13 Oct 2017 17:56:22 -0700 Subject: [PATCH] (QENG-5305) Check cluster utilization once at a time This commit adds a global provider_hosts concept in order to allow checking cluster utilization once per interval for a given cluster, retaining the results and reusing them for that interval, and tracking the least-used set of hosts. Without this change, each migration and clone operation inspects the utilization and state of every host in the cluster, which is computationally expensive for vSphere. --- lib/vmpooler/pool_manager.rb | 113 +++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 18 deletions(-) diff --git a/lib/vmpooler/pool_manager.rb b/lib/vmpooler/pool_manager.rb index 1b630ae..dc69d25 100644 --- a/lib/vmpooler/pool_manager.rb +++ b/lib/vmpooler/pool_manager.rb @@ -21,6 +21,9 @@ module Vmpooler # Our thread-tracker object $threads = {} + + # Host tracking object + $provider_hosts = {} end def config @@ -459,6 +462,77 @@ module Vmpooler end end + def get_provider_name(pool_name, config = $config) + pool = config[:pools].select { |p| p['name'] == pool_name }[0] + provider_name = pool['provider'] if pool.key?('provider') + provider_name = config[:providers].first[0].to_s if provider_name.nil? and config.key?(:providers) + provider_name = 'default' if provider_name.nil? + provider_name + end + + def get_cluster(pool_name) + default_cluster = $config[:config]['clone_target'] if $config[:config].key?('clone_target') + default_datacenter = $config[:config]['datacenter'] if $config[:config].key?('datacenter') + pool = $config[:pools].select { |p| p['name'] == pool_name }[0] + cluster = pool['clone_target'] if pool.key?('clone_target') + cluster = default_cluster if cluster.nil? + datacenter = pool['datacenter'] if pool.key?('datacenter') + datacenter = default_datacenter if datacenter.nil? + return if cluster.nil? + return if datacenter.nil? 
+ { 'cluster' => cluster, 'datacenter' => datacenter } + end + + def select_hosts(pool_name, provider, provider_name, cluster, datacenter, percentage) + $provider_hosts[provider_name] = {} unless $provider_hosts.key?(provider_name) + $provider_hosts[provider_name][datacenter] = {} unless $provider_hosts[provider_name].key?(datacenter) + $provider_hosts[provider_name][datacenter][cluster] = {} unless $provider_hosts[provider_name][datacenter].key?(cluster) + $provider_hosts[provider_name][datacenter][cluster]['checking'] = true + hosts_hash = provider.select_target_hosts(cluster, datacenter, percentage) + $provider_hosts[provider_name][datacenter][cluster] = hosts_hash + $provider_hosts[provider_name][datacenter][cluster]['check_time_finished'] = Time.now + end + + def run_select_hosts(provider, pool_name, provider_name, cluster, datacenter, max_age, percentage) + now = Time.now + if $provider_hosts.key?(provider_name) and $provider_hosts[provider_name].key?(datacenter) and $provider_hosts[provider_name][datacenter].key?(cluster) and $provider_hosts[provider_name][datacenter][cluster].key?('checking') + wait_for_host_selection(pool_name, provider_name, cluster, datacenter) + elsif $provider_hosts.key?(provider_name) and $provider_hosts[provider_name].key?(datacenter) and $provider_hosts[provider_name][datacenter].key?(cluster) and $provider_hosts[provider_name][datacenter][cluster].key?('check_time_finished') + select_hosts(pool_name, provider, provider_name, cluster, datacenter, percentage) if now - $provider_hosts[provider_name][datacenter][cluster]['check_time_finished'] > max_age + else + select_hosts(pool_name, provider, provider_name, cluster, datacenter, percentage) + end + end + + def wait_for_host_selection(pool_name, provider_name, cluster, datacenter, maxloop = 0, loop_delay = 5, max_age = 60) + loop_count = 1 + until $provider_hosts[provider_name][datacenter][cluster].key?('check_time_finished') + sleep(loop_delay) + unless maxloop.zero? 
+ break if loop_count >= maxloop + loop_count += 1 + end + end + return unless $provider_hosts[provider_name][datacenter][cluster].key?('check_time_finished') + loop_count = 1 + while Time.now - $provider_hosts[provider_name][datacenter][cluster]['check_time_finished'] > max_age + sleep(loop_delay) + unless maxloop.zero? + break if loop_count >= maxloop + loop_count += 1 + end + end + end + + def select_next_host(provider_name, datacenter, cluster, architecture) + provider_hosts = $provider_hosts + host = provider_hosts[provider_name][datacenter][cluster]['architectures'][architecture][0] + return if host.nil? + provider_hosts[provider_name][datacenter][cluster]['architectures'][architecture].delete(host) + provider_hosts[provider_name][datacenter][cluster]['architectures'][architecture] << host + host + end + def migration_limit(migration_limit) # Returns migration_limit setting when enabled return false if migration_limit == 0 || !migration_limit # rubocop:disable Style/NumericPredicate @@ -468,7 +542,7 @@ module Vmpooler def migrate_vm(vm_name, pool_name, provider) Thread.new do begin - _migrate_vm(vm_name, pool_name, provider) + _migrate_vm(vm_name, pool_name, provider, $provider_hosts) rescue => err $logger.log('s', "[x] [#{pool_name}] '#{vm_name}' migration failed with an error: #{err}") remove_vmpooler_migration_vm(pool_name, vm_name) @@ -477,29 +551,32 @@ module Vmpooler end def _migrate_vm(vm_name, pool_name, provider) - $redis.srem('vmpooler__migrating__' + pool_name, vm_name) + $redis.srem("vmpooler__migrating__#{pool_name}", vm_name) - parent_host_name = provider.get_vm_host(pool_name, vm_name) - raise('Unable to determine which host the VM is running on') if parent_host_name.nil? + provider_name = get_provider_name(pool_name) + vm = provider.get_vm_details(pool_name, vm_name) + raise('Unable to determine which host the VM is running on') if vm['host'].nil? 
migration_limit = migration_limit $config[:config]['migration_limit'] migration_count = $redis.scard('vmpooler__migration') - if !migration_limit - $logger.log('s', "[ ] [#{pool_name}] '#{vm_name}' is running on #{parent_host_name}") - return - elsif migration_count >= migration_limit - $logger.log('s', "[ ] [#{pool_name}] '#{vm_name}' is running on #{parent_host_name}. No migration will be evaluated since the migration_limit has been reached") - return - else - $redis.sadd('vmpooler__migration', vm_name) - host_name = provider.find_least_used_compatible_host(pool_name, vm_name) - if host_name == parent_host_name - $logger.log('s', "[ ] [#{pool_name}] No migration required for '#{vm_name}' running on #{parent_host_name}") + if migration_limit + max_age = 60 + percentage_of_hosts_below_average = 100 + run_select_hosts(provider, pool_name, provider_name, vm['cluster'], vm['datacenter'], max_age, percentage_of_hosts_below_average) + if migration_count >= migration_limit + $logger.log('s', "[ ] [#{pool_name}] '#{vm_name}' is running on #{vm['host']}. 
No migration will be evaluated since the migration_limit has been reached") + elsif $provider_hosts[provider_name][vm['datacenter']][vm['cluster']]['architectures'][vm['architecture']].include?(vm['host']) + $logger.log('s', "[ ] [#{pool_name}] No migration required for '#{vm_name}' running on #{vm['host']}") else - finish = migrate_vm_and_record_timing(vm_name, pool_name, parent_host_name, host_name, provider) - $logger.log('s', "[>] [#{pool_name}] '#{vm_name}' migrated from #{parent_host_name} to #{host_name} in #{finish} seconds") + $redis.sadd('vmpooler__migration', vm_name) + target_host_name = select_next_host(provider_name, vm['datacenter'], vm['cluster'], vm['architecture']) + finish = migrate_vm_and_record_timing(vm_name, pool_name, vm['host'], target_host_name, provider) + $logger.log('s', "[>] [#{pool_name}] '#{vm_name}' migrated from #{vm['host']} to #{target_host_name} in #{finish} seconds") + remove_vmpooler_migration_vm(pool_name, vm_name) end - remove_vmpooler_migration_vm(pool_name, vm_name) + return + else + $logger.log('s', "[ ] [#{pool_name}] '#{vm_name}' is running on #{vm['host']}") end end