From d0c32dc135f147ad352e28ff8c648e611516edec Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 3 Feb 2010 20:35:37 +0000 Subject: * In the build hook, if connecting to a machine fails, try the other machines of the right type (if available). This makes the build farm more robust to failures. --- scripts/build-remote.pl.in | 143 ++++++++++++++++++++++---------------------- scripts/nix-copy-closure.in | 2 +- scripts/ssh.pm | 5 +- 3 files changed, 77 insertions(+), 73 deletions(-) diff --git a/scripts/build-remote.pl.in b/scripts/build-remote.pl.in index 3ba4a60fd567..da26b85969a8 100755 --- a/scripts/build-remote.pl.in +++ b/scripts/build-remote.pl.in @@ -71,6 +71,7 @@ while () { , sshKeys => $3 , maxJobs => $4 , speedFactor => 1.0 * ($6 || 1) + , enabled => 1 }; } @@ -92,89 +93,96 @@ sub openSlotLock { } -# Find all machine that can execute this build, i.e., that support -# builds for the given platform and are not at their job limit. -my $rightType = 0; -my @available = (); -LOOP: foreach my $cur (@machines) { - if (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) { - $rightType = 1; - - # We have a machine of the right type. Determine the load on - # the machine. - my $slot = 0; - my $load = 0; - my $free; - while ($slot < $cur->{maxJobs}) { - my $slotLock = openSlotLock($cur, $slot); - if (flock($slotLock, LOCK_EX | LOCK_NB)) { - $free = $slot unless defined $free; - flock($slotLock, LOCK_UN) or die; - } else { - $load++; +my $hostName; + +while (1) { + + # Find all machine that can execute this build, i.e., that support + # builds for the given platform and are not at their job limit. + my $rightType = 0; + my @available = (); + LOOP: foreach my $cur (@machines) { + if ($cur->{enabled} && grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) { + $rightType = 1; + + # We have a machine of the right type. Determine the load on + # the machine. + my $slot = 0; + my $load = 0; + my $free; + while ($slot < $cur->{maxJobs}) { + my $slotLock = openSlotLock($cur, $slot); + if (flock($slotLock, LOCK_EX | LOCK_NB)) { + $free = $slot unless defined $free; + flock($slotLock, LOCK_UN) or die; + } else { + $load++; + } + close $slotLock; + $slot++; } - close $slotLock; - $slot++; - } - push @available, { machine => $cur, load => $load, free => $free } + push @available, { machine => $cur, load => $load, free => $free } if $load < $cur->{maxJobs}; + } } -} -if (defined $ENV{NIX_DEBUG_HOOK}) { - print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" - foreach @available; -} + if (defined $ENV{NIX_DEBUG_HOOK}) { + print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" + foreach @available; + } -# Didn't find any available machine? Then decline or postpone. -if (scalar @available == 0) { - # Postpone if we have a machine of the right type, except if the - # local system can and wants to do the build. - if ($rightType && !$canBuildLocally) { - sendReply "postpone"; - exit 0; - } else { - decline; + # Didn't find any available machine? Then decline or postpone. + if (scalar @available == 0) { + # Postpone if we have a machine of the right type, except if the + # local system can and wants to do the build. + if ($rightType && !$canBuildLocally) { + sendReply "postpone"; + exit 0; + } else { + decline; + } } -} -# Prioritise the available machines as follows: -# - First by load divided by speed factor, rounded to the nearest -# integer. This causes fast machines to be preferred over slow -# machines with similar loads. -# - Then by speed factor. -# - Finally by load. -sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } -@available = sort - { lf($a) <=> lf($b) - || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} - || $a->{load} <=> $b->{load} - } @available; + # Prioritise the available machines as follows: + # - First by load divided by speed factor, rounded to the nearest + # integer. This causes fast machines to be preferred over slow + # machines with similar loads. + # - Then by speed factor. + # - Finally by load. + sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } + @available = sort + { lf($a) <=> lf($b) + || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} + || $a->{load} <=> $b->{load} + } @available; -# Select the best available machine and lock a free slot. -my $selected = $available[0]; -my $machine = $selected->{machine}; + # Select the best available machine and lock a free slot. + my $selected = $available[0]; + my $machine = $selected->{machine}; -my $slotLock = openSlotLock($machine, $selected->{free}); -flock($slotLock, LOCK_EX | LOCK_NB) or die; -utime undef, undef, $slotLock; + my $slotLock = openSlotLock($machine, $selected->{free}); + flock($slotLock, LOCK_EX | LOCK_NB) or die; + utime undef, undef, $slotLock; -close MAINLOCK; + close MAINLOCK; + + + # Connect to the selected machine. + @sshOpts = ("-i", $machine->{sshKeys}, "-x"); + $hostName = $machine->{hostName}; + last if openSSHConnection $hostName; + + warn "unable to open SSH connection to $hostName, trying other available machines...\n"; + $machine->{enabled} = 0; +} # Tell Nix we've accepted the build. sendReply "accept"; -if (defined $ENV{NIX_DEBUG_HOOK}) { - my $hostName = $machine->{hostName}; - my $sp = $machine->{speedFactor}; - print STDERR "building `$drvPath' on `$hostName' - $sp - " . $selected->{free} . "\n"; - sleep 10; - exit 0; -} my $x = ; chomp $x; @@ -184,13 +192,8 @@ if ($x ne "okay") { # Do the actual build. -my $hostName = $machine->{hostName}; print STDERR "building `$drvPath' on `$hostName'\n"; -push @sshOpts, "-i", $machine->{sshKeys}, "-x"; - -openSSHConnection $hostName; - my $inputs = `cat inputs`; die if ($? != 0); $inputs =~ s/\n/ /g; diff --git a/scripts/nix-copy-closure.in b/scripts/nix-copy-closure.in index 313d6f0192e6..59046814b5b5 100644 --- a/scripts/nix-copy-closure.in +++ b/scripts/nix-copy-closure.in @@ -53,7 +53,7 @@ while (@ARGV) { } -openSSHConnection $sshHost; +openSSHConnection $sshHost or die "$0: unable to start SSH\n"; if ($toMode) { # Copy TO the remote machine. diff --git a/scripts/ssh.pm b/scripts/ssh.pm index 0295cef33b0a..cea486675ece 100644 --- a/scripts/ssh.pm +++ b/scripts/ssh.pm @@ -12,15 +12,16 @@ sub openSSHConnection { my ($host) = @_; die if $sshStarted; $sshHost = $host; - return if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0; + return 1 if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0; my $tmpDir = tempdir("nix-ssh.XXXXXX", CLEANUP => 1, TMPDIR => 1) or die "cannot create a temporary directory"; push @sshOpts, "-S", "$tmpDir/control"; system("ssh $sshHost @sshOpts -M -N -f") == 0 - or die "unable to start SSH: $?"; + or return 0; $sshStarted = 1; + return 1; } # Tell the master SSH client to exit. -- cgit 1.4.1