diff options
Diffstat (limited to 'scripts/build-remote.pl.in')
-rwxr-xr-x | scripts/build-remote.pl.in | 325 |
1 files changed, 177 insertions, 148 deletions
diff --git a/scripts/build-remote.pl.in b/scripts/build-remote.pl.in index c440b6a0f142..e943b0d9e304 100755 --- a/scripts/build-remote.pl.in +++ b/scripts/build-remote.pl.in @@ -3,7 +3,8 @@ use Fcntl ':flock'; use English '-no_match_vars'; use IO::Handle; -use ssh qw/sshOpts openSSHConnection/; +use SSH qw/sshOpts openSSHConnection/; +no warnings('once'); # General operation: @@ -31,57 +32,22 @@ $ENV{"DISPLAY"} = ""; $ENV{"SSH_ASKPASS"} = ""; -my $loadIncreased = 0; - -my ($amWilling, $localSystem, $neededSystem, $drvPath, $maxSilentTime) = @ARGV; -$maxSilentTime = 0 unless defined $maxSilentTime; - sub sendReply { my $reply = shift; print STDERR "# $reply\n"; } -sub decline { - sendReply "decline"; - exit 0; -} - -my $currentLoad = $ENV{"NIX_CURRENT_LOAD"}; -decline unless defined $currentLoad; -mkdir $currentLoad, 0777 or die unless -d $currentLoad; - -my $conf = $ENV{"NIX_REMOTE_SYSTEMS"}; -decline if !defined $conf || ! -e $conf; - -my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem); - +sub all { $_ || return 0 for @_; 1 } -# Read the list of machines. -my @machines; -open CONF, "< $conf" or die; - -while (<CONF>) { - chomp; - s/\#.*$//g; - next if /^\s*$/; - /^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)(\s+([0-9\.]+))?\s*$/ or die; - push @machines, - { hostName => $1 - , systemTypes => [split(/,/, $2)] - , sshKeys => $3 - , maxJobs => $4 - , speedFactor => 1.0 * ($6 || 1) - , enabled => 1 - }; -} -close CONF; +# Initialisation. +my $loadIncreased = 0; +my ($localSystem, $maxSilentTime, $printBuildTrace) = @ARGV; +$maxSilentTime = 0 unless defined $maxSilentTime; -# Acquire the exclusive lock on $currentLoad/main-lock. -my $mainLock = "$currentLoad/main-lock"; -open MAINLOCK, ">>$mainLock" or die; -flock(MAINLOCK, LOCK_EX) or die; +my $currentLoad = $ENV{"NIX_CURRENT_LOAD"}; +my $conf = $ENV{"NIX_REMOTE_SYSTEMS"}; sub openSlotLock { @@ -91,150 +57,213 @@ sub openSlotLock { open $slotLock, ">>$slotLockFn" or die; return $slotLock; } - -my $hostName; -my $slotLock; -while (1) { +# Read the list of machines. +my @machines; +if (defined $conf && -e $conf) { + open CONF, "< $conf" or die; + while (<CONF>) { + chomp; + s/\#.*$//g; + next if /^\s*$/; + my @tokens = split /\s/, $_; + push @machines, + { hostName => $tokens[0] + , systemTypes => [ split(/,/, $tokens[1]) ] + , sshKeys => $tokens[2] + , maxJobs => int($tokens[3]) + , speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1) + , features => [ split(/,/, $tokens[5] || "") ] + , enabled => 1 + }; + } + close CONF; +} + + + +# Wait for the calling process to ask us whether we can build some derivation. +my ($drvPath, $hostName, $slotLock); + +REQ: while (1) { + $_ = <STDIN> || exit 0; + my ($amWilling, $neededSystem); + ($amWilling, $neededSystem, $drvPath, $requiredFeatures) = split; + my @requiredFeatures = split /,/, $requiredFeatures; + + my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem); + + if (!defined $currentLoad) { + sendReply "decline"; + next; + } + + # Acquire the exclusive lock on $currentLoad/main-lock. + mkdir $currentLoad, 0777 or die unless -d $currentLoad; + my $mainLock = "$currentLoad/main-lock"; + open MAINLOCK, ">>$mainLock" or die; + flock(MAINLOCK, LOCK_EX) or die; - # Find all machine that can execute this build, i.e., that support - # builds for the given platform and are not at their job limit. - my $rightType = 0; - my @available = (); - LOOP: foreach my $cur (@machines) { - if ($cur->{enabled} && grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) { - $rightType = 1; - - # We have a machine of the right type. Determine the load on - # the machine. - my $slot = 0; - my $load = 0; - my $free; - while ($slot < $cur->{maxJobs}) { - my $slotLock = openSlotLock($cur, $slot); - if (flock($slotLock, LOCK_EX | LOCK_NB)) { - $free = $slot unless defined $free; - flock($slotLock, LOCK_UN) or die; - } else { - $load++; + + while (1) { + # Find all machine that can execute this build, i.e., that + # support builds for the given platform and features, and are + # not at their job limit. + my $rightType = 0; + my @available = (); + LOOP: foreach my $cur (@machines) { + if ($cur->{enabled} + && (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) + && all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{features}} } @requiredFeatures)) + { + $rightType = 1; + + # We have a machine of the right type. Determine the load on + # the machine. + my $slot = 0; + my $load = 0; + my $free; + while ($slot < $cur->{maxJobs}) { + my $slotLock = openSlotLock($cur, $slot); + if (flock($slotLock, LOCK_EX | LOCK_NB)) { + $free = $slot unless defined $free; + flock($slotLock, LOCK_UN) or die; + } else { + $load++; + } + close $slotLock; + $slot++; } - close $slotLock; - $slot++; + + push @available, { machine => $cur, load => $load, free => $free } + if $load < $cur->{maxJobs}; } - - push @available, { machine => $cur, load => $load, free => $free } - if $load < $cur->{maxJobs}; } - } - if (defined $ENV{NIX_DEBUG_HOOK}) { - print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" - foreach @available; - } + if (defined $ENV{NIX_DEBUG_HOOK}) { + print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" + foreach @available; + } - # Didn't find any available machine? Then decline or postpone. - if (scalar @available == 0) { - # Postpone if we have a machine of the right type, except if the - # local system can and wants to do the build. - if ($rightType && !$canBuildLocally) { - sendReply "postpone"; - exit 0; - } else { - decline; + # Didn't find any available machine? Then decline or postpone. + if (scalar @available == 0) { + # Postpone if we have a machine of the right type, except + # if the local system can and wants to do the build. + if ($rightType && !$canBuildLocally) { + sendReply "postpone"; + } else { + sendReply "decline"; + } + close MAINLOCK; + next REQ; } - } - - # Prioritise the available machines as follows: - # - First by load divided by speed factor, rounded to the nearest - # integer. This causes fast machines to be preferred over slow - # machines with similar loads. - # - Then by speed factor. - # - Finally by load. - sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } - @available = sort - { lf($a) <=> lf($b) - || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} - || $a->{load} <=> $b->{load} - } @available; + # Prioritise the available machines as follows: + # - First by load divided by speed factor, rounded to the nearest + # integer. This causes fast machines to be preferred over slow + # machines with similar loads. + # - Then by speed factor. + # - Finally by load. + sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } + @available = sort + { lf($a) <=> lf($b) + || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} + || $a->{load} <=> $b->{load} + } @available; - # Select the best available machine and lock a free slot. - my $selected = $available[0]; - my $machine = $selected->{machine}; - $slotLock = openSlotLock($machine, $selected->{free}); - flock($slotLock, LOCK_EX | LOCK_NB) or die; - utime undef, undef, $slotLock; + # Select the best available machine and lock a free slot. + my $selected = $available[0]; + my $machine = $selected->{machine}; + + $slotLock = openSlotLock($machine, $selected->{free}); + flock($slotLock, LOCK_EX | LOCK_NB) or die; + utime undef, undef, $slotLock; - close MAINLOCK; + close MAINLOCK; - # Connect to the selected machine. - @sshOpts = ("-i", $machine->{sshKeys}, "-x"); - $hostName = $machine->{hostName}; - last if openSSHConnection $hostName; + # Connect to the selected machine. + @sshOpts = ("-i", $machine->{sshKeys}, "-x"); + $hostName = $machine->{hostName}; + last REQ if openSSHConnection $hostName; - warn "unable to open SSH connection to $hostName, trying other available machines...\n"; - $machine->{enabled} = 0; + warn "unable to open SSH connection to $hostName, trying other available machines...\n"; + $machine->{enabled} = 0; + } } # Tell Nix we've accepted the build. sendReply "accept"; -my $x = <STDIN>; -chomp $x; - -if ($x ne "okay") { - exit 0; -} +my @inputs = split /\s/, readline(STDIN); +my @outputs = split /\s/, readline(STDIN); -# Do the actual build. print STDERR "building `$drvPath' on `$hostName'\n"; +print STDERR "@ build-remote $drvPath $hostName\n" if $printBuildTrace; -my $inputs = `cat inputs`; die if ($? != 0); -$inputs =~ s/\n/ /g; - -my $outputs = `cat outputs`; die if ($? != 0); -$outputs =~ s/\n/ /g; - -print "copying inputs...\n"; my $maybeSign = ""; $maybeSign = "--sign" if -e "/nix/etc/nix/signing-key.sec"; -system("NIX_SSHOPTS=\"@sshOpts\" @bindir@/nix-copy-closure $hostName $maybeSign $drvPath $inputs") == 0 + +# Register the derivation as a temporary GC root. Note that $PPID is +# the PID of the remote SSH process, which, due to the use of a +# persistant SSH connection, should be the same across all remote +# command invocations for this session. +my $rootsDir = "@localstatedir@/nix/gcroots/tmp"; +system("ssh $hostName @sshOpts 'mkdir -m 1777 -p $rootsDir; ln -sfn $drvPath $rootsDir/\$PPID.drv'"); + +sub removeRoots { + system("ssh $hostName @sshOpts 'rm -f $rootsDir/\$PPID.drv $rootsDir/\$PPID.out'"); +} + + +# Copy the derivation and its dependencies to the build machine. +system("NIX_SSHOPTS=\"@sshOpts\" @bindir@/nix-copy-closure $hostName $maybeSign $drvPath @inputs") == 0 or die "cannot copy inputs to $hostName: $?"; -print "building...\n"; - -my $buildFlags = "--max-silent-time $maxSilentTime --fallback"; - -# `-tt' forces allocation of a pseudo-terminal. This is required to -# make the remote nix-store process receive a signal when the -# connection dies. Without it, the remote process might continue to -# run indefinitely (that is, until it next tries to write to -# stdout/stderr). -if (system("ssh $hostName @sshOpts -tt 'nix-store -r $drvPath $buildFlags > /dev/null'") != 0) { - # If we couldn't run ssh or there was an ssh problem (indicated by - # exit code 255), then we return exit code 1; otherwise we assume - # that the builder failed, which we indicate to Nix using exit - # code 100. It's important to distinguish between the two because - # the first is a transient failure and the latter is permanent. - my $res = $? == -1 || ($? >> 8) == 255 ? 1 : 100; - print STDERR "build of `$drvPath' on `$hostName' failed with exit code $?\n"; + +# Perform the build. +my $buildFlags = "--max-silent-time $maxSilentTime --fallback --add-root $rootsDir/\$PPID.out --option verbosity 0"; + +# We let the remote side kill its process group when the connection is +# closed unexpectedly. This is necessary to ensure that no processes +# are left running on the remote system if the local Nix process is +# killed. (SSH itself doesn't kill child processes if the connection +# is interrupted unless the `-tt' flag is used to force a pseudo-tty, +# in which case every child receives SIGHUP; however, `-tt' doesn't +# work on some platforms when connection sharing is used.) +pipe STDIN, DUMMY; # make sure we have a readable STDIN +if (system("ssh $hostName @sshOpts '(read; kill -INT -\$\$) <&0 & nix-store -r $drvPath $buildFlags > /dev/null' 2>&4") != 0) { + # Note that if we get exit code 100 from `nix-store -r', it + # denotes a permanent build failure (as opposed to an SSH problem + # or a temporary Nix problem). We propagate this to the caller to + # allow it to distinguish between transient and permanent + # failures. + my $res = $? >> 8; + print STDERR "build of `$drvPath' on `$hostName' failed with exit code $res\n"; + removeRoots; exit $res; } -print "build of `$drvPath' on `$hostName' succeeded\n"; +#print "build of `$drvPath' on `$hostName' succeeded\n"; -foreach my $output (split '\n', $outputs) { + +# Copy the output from the build machine. +foreach my $output (@outputs) { my $maybeSignRemote = ""; $maybeSignRemote = "--sign" if $UID != 0; - system("ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output' | @bindir@/nix-store --import > /dev/null") == 0 + system("ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output'" . + "| NIX_HELD_LOCKS=$output @bindir@/nix-store --import > /dev/null") == 0 or die "cannot copy $output from $hostName: $?"; } + + +# Get rid of the temporary GC roots. +removeRoots; |