From cd6ebae825837c83d9a280b2249bd5e0232b6591 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Tue, 19 Jul 2022 12:35:43 -0500 Subject: [PATCH 01/11] adding more error handling when doing the net::ssh aws_setup portion --- lib/vmpooler/aws_setup.rb | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/vmpooler/aws_setup.rb b/lib/vmpooler/aws_setup.rb index 90a0825..1beb3d0 100644 --- a/lib/vmpooler/aws_setup.rb +++ b/lib/vmpooler/aws_setup.rb @@ -13,7 +13,7 @@ module Vmpooler def self.setup_node_by_ssh(host, platform) @key_file = ENV['AWS_KEY_FILE_LOCATION'] conn = check_ssh_accepting_connections(host, platform) - configure_host(host, platform, conn) + configure_host(host, platform, conn) if conn end # For an Amazon Linux AMI, the user name is ec2-user. @@ -49,9 +49,18 @@ module Vmpooler netssh_jruby_workaround Net::SSH.start(host, user, keys: @key_file, timeout: 10) rescue Net::SSH::ConnectionTimeout, Errno::ECONNREFUSED => e - puts "Requested instances do not have sshd ready yet, try again: #{e}" + puts "Requested instances do not have sshd ready yet, try again for 300s: #{e}" sleep 1 retry if (retries += 1) < 300 + rescue Errno::EBADF => e + puts "Jruby error, try again for 30s: #{e}" + puts e.backtrace + sleep 1 + retry if (retries += 1) < 30 + rescue StandardError => e + puts "Other error, cancelling aws_setup for #{host}: #{e}" + puts e.backtrace + return nil end end From ea36218120da635854210b48fb453b2699b7a8c4 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Tue, 19 Jul 2022 16:17:10 -0500 Subject: [PATCH 02/11] trying to debug net:ssh --- lib/vmpooler/aws_setup.rb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/vmpooler/aws_setup.rb b/lib/vmpooler/aws_setup.rb index 1beb3d0..2f084fa 100644 --- a/lib/vmpooler/aws_setup.rb +++ b/lib/vmpooler/aws_setup.rb @@ -13,7 +13,11 @@ module Vmpooler def self.setup_node_by_ssh(host, platform) @key_file = ENV['AWS_KEY_FILE_LOCATION'] conn = 
check_ssh_accepting_connections(host, platform) - configure_host(host, platform, conn) if conn + if conn + puts "#{host} connected" + configure_host(host, platform, conn) + puts "#{host} configured" + end end # For an Amazon Linux AMI, the user name is ec2-user. @@ -49,16 +53,15 @@ module Vmpooler netssh_jruby_workaround Net::SSH.start(host, user, keys: @key_file, timeout: 10) rescue Net::SSH::ConnectionTimeout, Errno::ECONNREFUSED => e - puts "Requested instances do not have sshd ready yet, try again for 300s: #{e}" + puts "#{host} Requested instances do not have sshd ready yet, try again for 300s (#{retries}): #{e}" sleep 1 retry if (retries += 1) < 300 rescue Errno::EBADF => e - puts "Jruby error, try again for 30s: #{e}" - puts e.backtrace + puts "#{host} Jruby error, try again for 30s (#{retries}): #{e}" sleep 1 retry if (retries += 1) < 30 rescue StandardError => e - puts "Other error, cancelling aws_setup for #{host}: #{e}" + puts "#{host} Other error, cancelling aws_setup: #{e}" puts e.backtrace return nil end From caecd112603b5e4fc56b69edf3aaa7a5bad7cdf1 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Wed, 20 Jul 2022 15:17:45 -0500 Subject: [PATCH 03/11] retry longer for the Jruby error --- lib/vmpooler/aws_setup.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vmpooler/aws_setup.rb b/lib/vmpooler/aws_setup.rb index 2f084fa..9e0d9c5 100644 --- a/lib/vmpooler/aws_setup.rb +++ b/lib/vmpooler/aws_setup.rb @@ -57,8 +57,8 @@ module Vmpooler sleep 1 retry if (retries += 1) < 300 rescue Errno::EBADF => e - puts "#{host} Jruby error, try again for 30s (#{retries}): #{e}" - sleep 1 + puts "#{host} Jruby error, try again for 300s (#{retries}): #{e}" + sleep 10 retry if (retries += 1) < 30 rescue StandardError => e puts "#{host} Other error, cancelling aws_setup: #{e}" From 0d7923ed4d3b5eeacd642b732e9fae30e49afb69 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Thu, 21 Jul 2022 11:44:24 -0500 Subject: [PATCH 04/11] fix logging, 
add proper aws_setup class, add a name for the VM which is based on the current site_name --- lib/vmpooler/aws_setup.rb | 29 +++++++++++++++++------------ lib/vmpooler/providers/ec2.rb | 17 ++++++++++++----- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/lib/vmpooler/aws_setup.rb b/lib/vmpooler/aws_setup.rb index 9e0d9c5..9b537f9 100644 --- a/lib/vmpooler/aws_setup.rb +++ b/lib/vmpooler/aws_setup.rb @@ -10,13 +10,18 @@ module Vmpooler ROOT_KEYS_SCRIPT = ENV['ROOT_KEYS_SCRIPT'] ROOT_KEYS_SYNC_CMD = "curl -k -o - -L #{ROOT_KEYS_SCRIPT} | %s" - def self.setup_node_by_ssh(host, platform) + def initialize(logger, new_vmname) + @logger = logger @key_file = ENV['AWS_KEY_FILE_LOCATION'] + @vm_name = new_vmname + end + + def setup_node_by_ssh(host, platform) conn = check_ssh_accepting_connections(host, platform) if conn - puts "#{host} connected" + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh connected") configure_host(host, platform, conn) - puts "#{host} configured" + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' configured") end end @@ -34,7 +39,7 @@ module Vmpooler # # For an Ubuntu AMI, the user name is ubuntu. 
- def self.get_user(platform) + def get_user(platform) if platform =~ /centos/ 'centos' elsif platform =~ /ubuntu/ @@ -46,22 +51,22 @@ module Vmpooler end end - def self.check_ssh_accepting_connections(host, platform) + def check_ssh_accepting_connections(host, platform) retries = 0 begin user = get_user(platform) netssh_jruby_workaround Net::SSH.start(host, user, keys: @key_file, timeout: 10) rescue Net::SSH::ConnectionTimeout, Errno::ECONNREFUSED => e - puts "#{host} Requested instances do not have sshd ready yet, try again for 300s (#{retries}): #{e}" + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh requested instances do not have sshd ready yet, try again for 300s (#{retries}/300): #{e}") sleep 1 retry if (retries += 1) < 300 rescue Errno::EBADF => e - puts "#{host} Jruby error, try again for 300s (#{retries}): #{e}" + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh jruby error, try again for 300s (#{retries}/30): #{e}") sleep 10 retry if (retries += 1) < 30 rescue StandardError => e - puts "#{host} Other error, cancelling aws_setup: #{e}" + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh other error, skipping aws_setup: #{e}") puts e.backtrace return nil end @@ -69,7 +74,7 @@ module Vmpooler # Configure the aws host by enabling root and setting the hostname # @param host [String] the internal dns name of the instance - def self.configure_host(host, platform, ssh) + def configure_host(host, platform, ssh) ssh.exec!('sudo cp -r .ssh /root/.') ssh.exec!("sudo sed -ri 's/^#?PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config") ssh.exec!("sudo hostname #{host}") @@ -81,7 +86,7 @@ module Vmpooler sync_root_keys(host, platform) end - def self.restart_sshd(host, platform, ssh) + def restart_sshd(host, platform, ssh) ssh.open_channel do |channel| channel.request_pty do |ch, success| raise "can't get pty request" unless success @@ -100,7 +105,7 @@ module Vmpooler ssh.loop end - def self.sync_root_keys(host, _platform) + def 
sync_root_keys(host, _platform) return if ROOT_KEYS_SCRIPT.nil? user = 'root' @@ -113,7 +118,7 @@ module Vmpooler # issue when using net ssh 6.1.0 with jruby # https://github.com/jruby/jruby-openssl/issues/105 # this will turn off some algos that match /^ecd(sa|h)-sha2/ - def self.netssh_jruby_workaround + def netssh_jruby_workaround Net::SSH::Transport::Algorithms::ALGORITHMS.each_value { |algs| algs.reject! { |a| a =~ /^ecd(sa|h)-sha2/ } } Net::SSH::KnownHosts::SUPPORTED_TYPE.reject! { |t| t =~ /^ecd(sa|h)-sha2/ } end diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index 4a7c26c..b7b0721 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -31,6 +31,7 @@ module Vmpooler # The default connection pool timeout should be quite large - 60 seconds connpool_timeout = provider_config['connection_pool_timeout'].nil? ? 60 : provider_config['connection_pool_timeout'].to_i logger.log('d', "[#{name}] ConnPool - Creating a connection pool of size #{connpool_size} with timeout #{connpool_timeout}") + @logger = logger @connection_pool = Vmpooler::PoolManager::GenericConnectionPool.new( metrics: metrics, connpool_type: 'provider_connection_pool', @@ -223,10 +224,15 @@ module Vmpooler key: 'portfolio', value: 'ds-ci' } - ] } ] + if global_config[:config] && global_config[:config]['site_name'] + tag.first[:tags] << { + key: 'Name', + value: global_config[:config]['site_name'] + } + end config = { min_count: 1, max_count: 1, @@ -250,13 +256,14 @@ module Vmpooler created_instance = get_vm(pool_name, new_vmname) # extra setup steps - provision_node_aws(created_instance['private_dns_name'], pool_name) if to_provision(pool_name) == 'true' || to_provision(pool_name) == true + provision_node_aws(created_instance['private_dns_name'], pool_name, new_vmname) if to_provision(pool_name) == 'true' || to_provision(pool_name) == true created_instance end - def provision_node_aws(vm, pool_name) - AwsSetup.setup_node_by_ssh(vm, pool_name) + 
def provision_node_aws(vm, pool_name, new_vmname) + aws_setup = AwsSetup.new(@logger, new_vmname) + aws_setup.setup_node_by_ssh(vm, pool_name) end def get_block_device_mappings(image_id, volume_size) @@ -511,7 +518,7 @@ module Vmpooler def debug_logger(message, send_to_upstream: false) # the default logger is simple and does not enforce debug levels (the first argument) puts message if ENV['DEBUG_FLAG'] - logger.log('[g]', message) if send_to_upstream + @logger.log('[g]', message) if send_to_upstream end end end From 4e85886da4985893b44e6f6983f13021f381738e Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Fri, 22 Jul 2022 13:37:13 -0500 Subject: [PATCH 05/11] wait until instance reports being status ready (aws checks that it is reachable) which fixes a lot of the net:ssh issues for provisioning --- lib/vmpooler/providers/ec2.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index b7b0721..14a68f2 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -253,6 +253,15 @@ module Vmpooler batch_instance = connection.create_instances(config) instance_id = batch_instance.first.instance_id connection.client.wait_until(:instance_running, { instance_ids: [instance_id] }) + @logger.log('s', "[>] [#{pool_name}] '#{new_vmname}' instance running") + ### System status checks + # This check verifies that your instance is reachable. Amazon EC2 tests that network packets can get to your instance. + # If this check fails, there might be an issue with the infrastructure that is hosting your instance (such as AWS power, networking, or software systems). You can restart or replace the instance, wait for Amazon EC2’s systems to resolve the issue, or seek technical support. + # This check does not validate that your operating system and applications are accepting traffic. + ### Instance status checks + # This check verifies that your instance's operating system is accepting traffic. 
+ connection.client.wait_until(:instance_status_ok, { instance_ids: [instance_id] }) + @logger.log('s', "[>] [#{pool_name}] '#{new_vmname}' instance ready to accept traffic") created_instance = get_vm(pool_name, new_vmname) # extra setup steps From 28c5331e38a7a383b1faf3518810e352096431b8 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Fri, 22 Jul 2022 14:16:28 -0500 Subject: [PATCH 06/11] puts for spec logging --- spec/helpers.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/helpers.rb b/spec/helpers.rb index 4b2dff6..eefcb04 100644 --- a/spec/helpers.rb +++ b/spec/helpers.rb @@ -10,7 +10,9 @@ end # Mock an object which represents a Logger. This stops the proliferation # of allow(logger).to .... expectations in tests. class MockLogger - def log(_level, string); end + def log(_level, string); + puts "#{string}" + end end def expect_json(ok = true, http = 200) From b178aba8831a3b97fc4280f194a7b6cd938dd713 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Fri, 22 Jul 2022 14:43:37 -0500 Subject: [PATCH 07/11] log vm_ready? error from provider when vmpooler cannot tcp to port 22 --- lib/vmpooler/providers/ec2.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index 14a68f2..e084994 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -395,7 +395,8 @@ module Vmpooler vm_name = vm_ip unless vm_ip.nil? end open_socket(vm_name, domain_set) - rescue StandardError => _e + rescue StandardError => e + @logger.log('s', "[!] 
[#{pool_name}] '#{vm_name}' instance cannot be reached by vmpooler on tcp port 22; #{e}") return false end true From cab859272e3b7f25c05fb3e1de29117a38aaed94 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Fri, 22 Jul 2022 15:42:10 -0500 Subject: [PATCH 08/11] adding debug --- lib/vmpooler/providers/ec2.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index e084994..ff64335 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -391,9 +391,10 @@ module Vmpooler # TODO: we could use a healthcheck resource attached to instance domain_set = domain || global_config[:config]['domain'] if domain_set.nil? - vm_ip = get_vm(pool_name, vm_name)['private_ip_address'] + vm_ip = get_vm(pool_name, vm_name)['private_dns_name'] vm_name = vm_ip unless vm_ip.nil? end + @logger.log('s', "[>] [#{pool_name}] '#{vm_name}' vm_ready? #{domain_set} #{vm_ip}") open_socket(vm_name, domain_set) rescue StandardError => e @logger.log('s', "[!] [#{pool_name}] '#{vm_name}' instance cannot be reached by vmpooler on tcp port 22; #{e}") From 61c71d6e4e9b3f36edc2fac02dec84242e82aafc Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Fri, 22 Jul 2022 15:47:13 -0500 Subject: [PATCH 09/11] fix vm_ready to check only the provider domain key, which will be used in the future for cloud dns --- lib/vmpooler/providers/ec2.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index ff64335..d38c3c7 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -389,12 +389,11 @@ module Vmpooler def vm_ready?(pool_name, vm_name) begin # TODO: we could use a healthcheck resource attached to instance - domain_set = domain || global_config[:config]['domain'] + domain_set = domain if domain_set.nil? vm_ip = get_vm(pool_name, vm_name)['private_dns_name'] vm_name = vm_ip unless vm_ip.nil? 
end - @logger.log('s', "[>] [#{pool_name}] '#{vm_name}' vm_ready? #{domain_set} #{vm_ip}") open_socket(vm_name, domain_set) rescue StandardError => e @logger.log('s', "[!] [#{pool_name}] '#{vm_name}' instance cannot be reached by vmpooler on tcp port 22; #{e}") From a974045a627bec710bd593d92ddd753c69ca832f Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: Tue, 26 Jul 2022 09:41:12 -0500 Subject: [PATCH 10/11] set name to spicy proton, fix lifetime --- lib/vmpooler/providers/ec2.rb | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index d38c3c7..ce48f57 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -205,11 +205,11 @@ module Vmpooler value: pool_name }, { - key: 'lifetime', + key: 'lifetime', # required by AWS reaper value: get_current_lifetime(new_vmname) }, { - key: 'created_by', + key: 'created_by', # required by AWS reaper value: get_current_user(new_vmname) }, { @@ -217,22 +217,20 @@ module Vmpooler value: get_current_job_url(new_vmname) }, { - key: 'organization', + key: 'organization', # required by AWS reaper value: 'engineering' }, { - key: 'portfolio', + key: 'portfolio', # required by AWS reaper value: 'ds-ci' + }, + { + key: 'Name', + value: new_vmname } ] } ] - if global_config[:config] && global_config[:config]['site_name'] - tag.first[:tags] << { - key: 'Name', - value: global_config[:config]['site_name'] - } - end config = { min_count: 1, max_count: 1, @@ -447,10 +445,11 @@ module Vmpooler end end + # returns lifetime in hours in the format Xh defaults to 1h def get_current_lifetime(vm_name) @redis.with_metrics do |redis| - lifetime = redis.hget("vmpooler__vm__#{vm_name}", 'lifetime') || '1h' - return lifetime + lifetime = redis.hget("vmpooler__vm__#{vm_name}", 'lifetime') || '1' + return "#{lifetime}h" end end From c6f7991aacb23a895fbb1d7df501d1598aad6787 Mon Sep 17 00:00:00 2001 From: Samuel Beaulieu Date: 
Tue, 26 Jul 2022 09:57:40 -0500 Subject: [PATCH 11/11] use jruby 9.3.6 --- .github/workflows/release.yml | 4 ++-- .github/workflows/testing.yml | 5 ++--- lib/vmpooler/aws_setup.rb | 11 +++++------ lib/vmpooler/providers/ec2.rb | 2 -- vmpooler-provider-ec2.gemspec | 2 +- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 17ec95b..e0538bf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,10 +20,10 @@ jobs: draft: false prerelease: false generateReleaseNotes: true - - name: Install Ruby 2.5.8 + - name: Install Ruby jruby-9.3.6.0 uses: ruby/setup-ruby@v1 with: - ruby-version: '2.5.8' + ruby-version: 'jruby-9.3.6.0' - name: Build gem run: gem build *.gemspec - name: Publish gem diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 501403f..068495e 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: ruby-version: - - '2.5.8' + - 'jruby-9.3.6.0' steps: - uses: actions/checkout@v2 - name: Set up Ruby @@ -34,8 +34,7 @@ jobs: strategy: matrix: ruby-version: - - '2.5.8' - - 'jruby-9.2.12.0' + - 'jruby-9.3.6.0' steps: - uses: actions/checkout@v2 - name: Set up Ruby diff --git a/lib/vmpooler/aws_setup.rb b/lib/vmpooler/aws_setup.rb index 9b537f9..8077faf 100644 --- a/lib/vmpooler/aws_setup.rb +++ b/lib/vmpooler/aws_setup.rb @@ -18,11 +18,11 @@ module Vmpooler def setup_node_by_ssh(host, platform) conn = check_ssh_accepting_connections(host, platform) - if conn - @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh connected") - configure_host(host, platform, conn) - @logger.log('s', "[>] [#{platform}] '#{@vm_name}' configured") - end + return unless conn + + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh connected") + configure_host(host, platform, conn) + @logger.log('s', "[>] [#{platform}] '#{@vm_name}' configured") end # For an Amazon Linux AMI, the 
user name is ec2-user. @@ -68,7 +68,6 @@ module Vmpooler rescue StandardError => e @logger.log('s', "[>] [#{platform}] '#{@vm_name}' net:ssh other error, skipping aws_setup: #{e}") puts e.backtrace - return nil end end diff --git a/lib/vmpooler/providers/ec2.rb b/lib/vmpooler/providers/ec2.rb index ce48f57..7ef5e36 100644 --- a/lib/vmpooler/providers/ec2.rb +++ b/lib/vmpooler/providers/ec2.rb @@ -254,8 +254,6 @@ module Vmpooler @logger.log('s', "[>] [#{pool_name}] '#{new_vmname}' instance running") ### System status checks # This check verifies that your instance is reachable. Amazon EC2 tests that network packets can get to your instance. - # If this check fails, there might be an issue with the infrastructure that is hosting your instance (such as AWS power, networking, or software systems). You can restart or replace the instance, wait for Amazon EC2’s systems to resolve the issue, or seek technical support. - # This check does not validate that your operating system and applications are accepting traffic. ### Instance status checks # This check verifies that your instance's operating system is accepting traffic. connection.client.wait_until(:instance_status_ok, { instance_ids: [instance_id] }) diff --git a/vmpooler-provider-ec2.gemspec b/vmpooler-provider-ec2.gemspec index a569143..dd6986a 100644 --- a/vmpooler-provider-ec2.gemspec +++ b/vmpooler-provider-ec2.gemspec @@ -26,7 +26,7 @@ Gem::Specification.new do |s| s.add_development_dependency 'pry' s.add_development_dependency 'rack-test', '>= 0.6' s.add_development_dependency 'rspec', '>= 3.2' - s.add_development_dependency 'rubocop', '~> 1.1.0' + s.add_development_dependency 'rubocop', '~> 1.28.2' s.add_development_dependency 'simplecov', '>= 0.11.2' s.add_development_dependency 'thor', '~> 1.0', '>= 1.0.1' s.add_development_dependency 'yarjuf', '>= 2.0'