about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Eelco Dolstra <e.dolstra@tudelft.nl> 2010-02-03T20:35+0000
committer: Eelco Dolstra <e.dolstra@tudelft.nl> 2010-02-03T20:35+0000
commit: d0c32dc135f147ad352e28ff8c648e611516edec (patch)
tree: c474e310974cb3f9aa9acb0be1625c7d1cd50799
parent: f56a039775930d4ba2b4504440b7ab37dfefeb75 (diff)
* In the build hook, if connecting to a machine fails, try the other
  machines of the right type (if available).  This makes the build
  farm more robust to failures. 

-rwxr-xr-x  scripts/build-remote.pl.in  143
-rw-r--r--  scripts/nix-copy-closure.in  2
-rw-r--r--  scripts/ssh.pm  5
3 files changed, 77 insertions, 73 deletions
diff --git a/scripts/build-remote.pl.in b/scripts/build-remote.pl.in
index 3ba4a60fd567..da26b85969a8 100755
--- a/scripts/build-remote.pl.in
+++ b/scripts/build-remote.pl.in
@@ -71,6 +71,7 @@ while (<CONF>) {
         , sshKeys => $3
         , maxJobs => $4
         , speedFactor => 1.0 * ($6 || 1)
+        , enabled => 1
         };
 }
 
@@ -92,89 +93,96 @@ sub openSlotLock {
 }
     
 
-# Find all machine that can execute this build, i.e., that support
-# builds for the given platform and are not at their job limit.
-my $rightType = 0;
-my @available = ();
-LOOP: foreach my $cur (@machines) {
-    if (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) {
-        $rightType = 1;
-
-        # We have a machine of the right type.  Determine the load on
-        # the machine.
-        my $slot = 0;
-        my $load = 0;
-        my $free;
-        while ($slot < $cur->{maxJobs}) {
-            my $slotLock = openSlotLock($cur, $slot);
-            if (flock($slotLock, LOCK_EX | LOCK_NB)) {
-                $free = $slot unless defined $free;
-                flock($slotLock, LOCK_UN) or die;
-            } else {
-                $load++;
+my $hostName;
+
+while (1) {
+    
+    # Find all machine that can execute this build, i.e., that support
+    # builds for the given platform and are not at their job limit.
+    my $rightType = 0;
+    my @available = ();
+    LOOP: foreach my $cur (@machines) {
+        if ($cur->{enabled} && grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) {
+            $rightType = 1;
+
+            # We have a machine of the right type.  Determine the load on
+            # the machine.
+            my $slot = 0;
+            my $load = 0;
+            my $free;
+            while ($slot < $cur->{maxJobs}) {
+                my $slotLock = openSlotLock($cur, $slot);
+                if (flock($slotLock, LOCK_EX | LOCK_NB)) {
+                    $free = $slot unless defined $free;
+                    flock($slotLock, LOCK_UN) or die;
+                } else {
+                    $load++;
+                }
+                close $slotLock;
+                $slot++;
             }
-            close $slotLock;
-            $slot++;
-        }
 
-        push @available, { machine => $cur, load => $load, free => $free }
+            push @available, { machine => $cur, load => $load, free => $free }
             if $load < $cur->{maxJobs};
+        }
     }
-}
 
-if (defined $ENV{NIX_DEBUG_HOOK}) {
-    print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
-        foreach @available;
-}
+    if (defined $ENV{NIX_DEBUG_HOOK}) {
+        print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
+            foreach @available;
+    }
 
 
-# Didn't find any available machine?  Then decline or postpone.
-if (scalar @available == 0) {
-    # Postpone if we have a machine of the right type, except if the
-    # local system can and wants to do the build.
-    if ($rightType && !$canBuildLocally) {
-        sendReply "postpone";
-        exit 0;
-    } else {
-        decline;
+    # Didn't find any available machine?  Then decline or postpone.
+    if (scalar @available == 0) {
+        # Postpone if we have a machine of the right type, except if the
+        # local system can and wants to do the build.
+        if ($rightType && !$canBuildLocally) {
+            sendReply "postpone";
+            exit 0;
+        } else {
+            decline;
+        }
     }
-}
 
 
-# Prioritise the available machines as follows:
-# - First by load divided by speed factor, rounded to the nearest
-#   integer.  This causes fast machines to be preferred over slow
-#   machines with similar loads.
-# - Then by speed factor.
-# - Finally by load.
-sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
-@available = sort
-    { lf($a) <=> lf($b)
-          || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
-          || $a->{load} <=> $b->{load}
-    } @available;
+    # Prioritise the available machines as follows:
+    # - First by load divided by speed factor, rounded to the nearest
+    #   integer.  This causes fast machines to be preferred over slow
+    #   machines with similar loads.
+    # - Then by speed factor.
+    # - Finally by load.
+    sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
+    @available = sort
+        { lf($a) <=> lf($b)
+              || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
+              || $a->{load} <=> $b->{load}
+        } @available;
 
 
-# Select the best available machine and lock a free slot.
-my $selected = $available[0]; 
-my $machine = $selected->{machine};
+    # Select the best available machine and lock a free slot.
+    my $selected = $available[0]; 
+    my $machine = $selected->{machine};
 
-my $slotLock = openSlotLock($machine, $selected->{free});
-flock($slotLock, LOCK_EX | LOCK_NB) or die;
-utime undef, undef, $slotLock;
+    my $slotLock = openSlotLock($machine, $selected->{free});
+    flock($slotLock, LOCK_EX | LOCK_NB) or die;
+    utime undef, undef, $slotLock;
 
-close MAINLOCK;
+    close MAINLOCK;
+
+
+    # Connect to the selected machine.
+    @sshOpts = ("-i", $machine->{sshKeys}, "-x");
+    $hostName = $machine->{hostName};
+    last if openSSHConnection $hostName;
+    
+    warn "unable to open SSH connection to $hostName, trying other available machines...\n";
+    $machine->{enabled} = 0;
+}
 
 
 # Tell Nix we've accepted the build.
 sendReply "accept";
-if (defined $ENV{NIX_DEBUG_HOOK}) {
-    my $hostName = $machine->{hostName};
-    my $sp = $machine->{speedFactor};
-    print STDERR "building `$drvPath' on `$hostName' - $sp - " . $selected->{free} . "\n";
-    sleep 10;
-    exit 0;
-}
 my $x = <STDIN>;
 chomp $x;
 
@@ -184,13 +192,8 @@ if ($x ne "okay") {
 
 
 # Do the actual build.
-my $hostName = $machine->{hostName};
 print STDERR "building `$drvPath' on `$hostName'\n";
 
-push @sshOpts, "-i", $machine->{sshKeys}, "-x";
-
-openSSHConnection $hostName;
-
 my $inputs = `cat inputs`; die if ($? != 0);
 $inputs =~ s/\n/ /g;
 
diff --git a/scripts/nix-copy-closure.in b/scripts/nix-copy-closure.in
index 313d6f0192e6..59046814b5b5 100644
--- a/scripts/nix-copy-closure.in
+++ b/scripts/nix-copy-closure.in
@@ -53,7 +53,7 @@ while (@ARGV) {
 }
 
 
-openSSHConnection $sshHost;
+openSSHConnection $sshHost or die "$0: unable to start SSH\n";
 
 
 if ($toMode) { # Copy TO the remote machine.
diff --git a/scripts/ssh.pm b/scripts/ssh.pm
index 0295cef33b0a..cea486675ece 100644
--- a/scripts/ssh.pm
+++ b/scripts/ssh.pm
@@ -12,15 +12,16 @@ sub openSSHConnection {
     my ($host) = @_;
     die if $sshStarted;
     $sshHost = $host;
-    return if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0;
+    return 1 if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0;
 
     my $tmpDir = tempdir("nix-ssh.XXXXXX", CLEANUP => 1, TMPDIR => 1)
         or die "cannot create a temporary directory";
     
     push @sshOpts, "-S", "$tmpDir/control";
     system("ssh $sshHost @sshOpts -M -N -f") == 0
-        or die "unable to start SSH: $?";
+        or return 0;
     $sshStarted = 1;
+    return 1;
 }
 
 # Tell the master SSH client to exit.