diff options
Diffstat (limited to 'scripts/build-remote.pl.in')
-rwxr-xr-x | scripts/build-remote.pl.in | 302 |
1 files changed, 302 insertions, 0 deletions
diff --git a/scripts/build-remote.pl.in b/scripts/build-remote.pl.in new file mode 100755 index 000000000000..6dfa16d5cbda --- /dev/null +++ b/scripts/build-remote.pl.in @@ -0,0 +1,302 @@ +#! @perl@ -w @perlFlags@ + +use Fcntl qw(:DEFAULT :flock); +use English '-no_match_vars'; +use IO::Handle; +use Nix::Config; +use Nix::SSH qw/sshOpts openSSHConnection/; +use Nix::CopyClosure; +use Nix::Store; +no warnings('once'); + + +# General operation: +# +# Try to find a free machine of type $neededSystem. We do this as +# follows: +# - We acquire an exclusive lock on $currentLoad/main-lock. +# - For each machine $machine of type $neededSystem and for each $slot +# less than the maximum load for that machine, we try to get an +# exclusive lock on $currentLoad/$machine-$slot (without blocking). +# If we get such a lock, we send "accept" to the caller. Otherwise, +# we send "postpone" and exit. +# - We release the exclusive lock on $currentLoad/main-lock. +# - We perform the build on $neededSystem. +# - We release the exclusive lock on $currentLoad/$machine-$slot. +# +# The nice thing about this scheme is that if we die prematurely, the +# locks are released automatically. + + +# Make sure that we don't get any SSH passphrase or host key popups - +# if there is any problem it should fail, not do something +# interactive. +$ENV{"DISPLAY"} = ""; +$ENV{"SSH_ASKPASS"} = ""; + + +sub sendReply { + my $reply = shift; + print STDERR "# $reply\n"; +} + +sub all { $_ || return 0 for @_; 1 } + + +# Initialisation. +my $loadIncreased = 0; + +my ($localSystem, $maxSilentTime, $printBuildTrace, $buildTimeout) = @ARGV; + +my $currentLoad = $ENV{"NIX_CURRENT_LOAD"}; +my $conf = $ENV{"NIX_REMOTE_SYSTEMS"}; + + +sub openSlotLock { + my ($machine, $slot) = @_; + my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot"; + my $slotLock = new IO::Handle; + sysopen $slotLock, "$slotLockFn", O_RDWR|O_CREAT, 0600 or die; + return $slotLock; +} + + +# Read the list of machines. +my @machines; +if (defined $conf && -e $conf) { + open CONF, "<$conf" or die; + while (<CONF>) { + chomp; + s/\#.*$//g; + next if /^\s*$/; + my @tokens = split /\s/, $_; + my @supportedFeatures = split(/,/, $tokens[5] || ""); + my @mandatoryFeatures = split(/,/, $tokens[6] || ""); + push @machines, + { hostName => $tokens[0] + , systemTypes => [ split(/,/, $tokens[1]) ] + , sshKeys => $tokens[2] + , maxJobs => int($tokens[3]) + , speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1) + , supportedFeatures => [ @supportedFeatures, @mandatoryFeatures ] + , mandatoryFeatures => [ @mandatoryFeatures ] + , enabled => 1 + }; + } + close CONF; +} + + + +# Wait for the calling process to ask us whether we can build some derivation. +my ($drvPath, $hostName, $slotLock); + +REQ: while (1) { + $_ = <STDIN> || exit 0; + (my $amWilling, my $neededSystem, $drvPath, my $requiredFeatures) = split; + my @requiredFeatures = split /,/, $requiredFeatures; + + my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem); + + if (!defined $currentLoad) { + sendReply "decline"; + next; + } + + # Acquire the exclusive lock on $currentLoad/main-lock. + mkdir $currentLoad, 0777 or die unless -d $currentLoad; + my $mainLock = "$currentLoad/main-lock"; + sysopen MAINLOCK, "$mainLock", O_RDWR|O_CREAT, 0600 or die; + flock(MAINLOCK, LOCK_EX) or die; + + + while (1) { + # Find all machine that can execute this build, i.e., that + # support builds for the given platform and features, and are + # not at their job limit. + my $rightType = 0; + my @available = (); + LOOP: foreach my $cur (@machines) { + if ($cur->{enabled} + && (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) + && all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{supportedFeatures}} } (@requiredFeatures, @mandatoryFeatures)) + && all(map { my $f = $_; 0 != grep { $f eq $_ } @requiredFeatures } @{$cur->{mandatoryFeatures}}) + ) + { + $rightType = 1; + + # We have a machine of the right type. Determine the load on + # the machine. + my $slot = 0; + my $load = 0; + my $free; + while ($slot < $cur->{maxJobs}) { + my $slotLock = openSlotLock($cur, $slot); + if (flock($slotLock, LOCK_EX | LOCK_NB)) { + $free = $slot unless defined $free; + flock($slotLock, LOCK_UN) or die; + } else { + $load++; + } + close $slotLock; + $slot++; + } + + push @available, { machine => $cur, load => $load, free => $free } + if $load < $cur->{maxJobs}; + } + } + + if (defined $ENV{NIX_DEBUG_HOOK}) { + print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" + foreach @available; + } + + + # Didn't find any available machine? Then decline or postpone. + if (scalar @available == 0) { + # Postpone if we have a machine of the right type, except + # if the local system can and wants to do the build. + if ($rightType && !$canBuildLocally) { + sendReply "postpone"; + } else { + sendReply "decline"; + } + close MAINLOCK; + next REQ; + } + + + # Prioritise the available machines as follows: + # - First by load divided by speed factor, rounded to the nearest + # integer. This causes fast machines to be preferred over slow + # machines with similar loads. + # - Then by speed factor. + # - Finally by load. + sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } + @available = sort + { lf($a) <=> lf($b) + || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} + || $a->{load} <=> $b->{load} + } @available; + + + # Select the best available machine and lock a free slot. + my $selected = $available[0]; + my $machine = $selected->{machine}; + + $slotLock = openSlotLock($machine, $selected->{free}); + flock($slotLock, LOCK_EX | LOCK_NB) or die; + utime undef, undef, $slotLock; + + close MAINLOCK; + + + # Connect to the selected machine. + @sshOpts = ("-i", $machine->{sshKeys}, "-x"); + $hostName = $machine->{hostName}; + if (openSSHConnection($hostName)) { + last REQ if system("ssh $hostName @sshOpts nix-builds-inhibited < /dev/null > /dev/null 2>&1") != 0; + warn "machine `$hostName' is refusing builds, trying other available machines...\n"; + closeSSHConnection; + } else { + warn "unable to open SSH connection to `$hostName', trying other available machines...\n"; + } + $machine->{enabled} = 0; + } +} + + +# Tell Nix we've accepted the build. +sendReply "accept"; +my @inputs = split /\s/, readline(STDIN); +my @outputs = split /\s/, readline(STDIN); + + +print STDERR "@ build-remote $drvPath $hostName\n" if $printBuildTrace; + + +my $maybeSign = ""; +$maybeSign = "--sign" if -e "$Nix::Config::confDir/signing-key.sec"; + + +# Register the derivation as a temporary GC root. Note that $PPID is +# the PID of the remote SSH process, which, due to the use of a +# persistant SSH connection, should be the same across all remote +# command invocations for this session. +my $rootsDir = "@localstatedir@/nix/gcroots/tmp"; +system("ssh $hostName @sshOpts 'mkdir -m 1777 -p $rootsDir; ln -sfn $drvPath $rootsDir/\$PPID.drv'"); + +sub removeRoots { + system("ssh $hostName @sshOpts 'rm -f $rootsDir/\$PPID.drv $rootsDir/\$PPID.out'"); +} + + +# Copy the derivation and its dependencies to the build machine. This +# is guarded by an exclusive lock per machine to prevent multiple +# build-remote instances from copying to a machine simultaneously. +# That's undesirable because we may end up with N instances uploading +# the same missing path simultaneously, causing the effective network +# bandwidth and target disk speed to be divided by N. +my $uploadLock = "$currentLoad/$hostName.upload-lock"; +sysopen UPLOADLOCK, "$uploadLock", O_RDWR|O_CREAT, 0600 or die; +eval { + local $SIG{ALRM} = sub { die "alarm\n" }; + # Don't wait forever, so that a process that gets stuck while + # holding the lock doesn't block everybody else indefinitely. + # It's safe to continue after a timeout, just (potentially) + # inefficient. + alarm 15 * 60; + flock(UPLOADLOCK, LOCK_EX); + alarm 0; +}; +if ($@) { + die unless $@ eq "alarm\n"; + print STDERR "somebody is hogging $uploadLock, continuing...\n"; + unlink $uploadLock; +} +Nix::CopyClosure::copyTo($hostName, [ @sshOpts ], [ $drvPath, @inputs ], "", "", 0, 0, $maybeSign ne "", ""); +close UPLOADLOCK; + + +# Perform the build. +my $buildFlags = + "--max-silent-time $maxSilentTime --option build-timeout $buildTimeout" + . " --fallback --add-root $rootsDir/\$PPID.out --quiet" + . " --option build-keep-log false --option build-use-substitutes false"; + +# We let the remote side kill its process group when the connection is +# closed unexpectedly. This is necessary to ensure that no processes +# are left running on the remote system if the local Nix process is +# killed. (SSH itself doesn't kill child processes if the connection +# is interrupted unless the `-tt' flag is used to force a pseudo-tty, +# in which case every child receives SIGHUP; however, `-tt' doesn't +# work on some platforms when connection sharing is used.) +print STDERR "building `$drvPath' on `$hostName'\n"; +pipe STDIN, DUMMY; # make sure we have a readable STDIN +if (system("exec ssh $hostName @sshOpts '(read; kill -INT -\$\$) <&0 & exec nix-store -r $drvPath $buildFlags > /dev/null' 2>&4") != 0) { + # Note that if we get exit code 100 from `nix-store -r', it + # denotes a permanent build failure (as opposed to an SSH problem + # or a temporary Nix problem). We propagate this to the caller to + # allow it to distinguish between transient and permanent + # failures. + my $res = $? >> 8; + print STDERR "build of `$drvPath' on `$hostName' failed with exit code $res\n"; + removeRoots; + exit $res; +} + +#print "build of `$drvPath' on `$hostName' succeeded\n"; + + +# Copy the output from the build machine. +my @outputs2 = grep { !isValidPath($_) } @outputs; +if (scalar @outputs2 > 0) { + system("exec ssh $hostName @sshOpts 'nix-store --export @outputs2'" . + "| NIX_HELD_LOCKS='@outputs2' @bindir@/nix-store --import > /dev/null") == 0 + or die("cannot copy paths " . join(", ", @outputs) . " from `$hostName': $?"); +} + + +# Get rid of the temporary GC roots. +removeRoots; |