#! @perl@ -w @perlFlags@ use Fcntl qw(:DEFAULT :flock); use English '-no_match_vars'; use IO::Handle; use Nix::Config; use Nix::SSH qw/sshOpts openSSHConnection/; use Nix::CopyClosure; use Nix::Store; no warnings('once'); # General operation: # # Try to find a free machine of type $neededSystem. We do this as # follows: # - We acquire an exclusive lock on $currentLoad/main-lock. # - For each machine $machine of type $neededSystem and for each $slot # less than the maximum load for that machine, we try to get an # exclusive lock on $currentLoad/$machine-$slot (without blocking). # If we get such a lock, we send "accept" to the caller. Otherwise, # we send "postpone" and exit. # - We release the exclusive lock on $currentLoad/main-lock. # - We perform the build on $neededSystem. # - We release the exclusive lock on $currentLoad/$machine-$slot. # # The nice thing about this scheme is that if we die prematurely, the # locks are released automatically. # Make sure that we don't get any SSH passphrase or host key popups - # if there is any problem it should fail, not do something # interactive. $ENV{"DISPLAY"} = ""; $ENV{"SSH_ASKPASS"} = ""; sub sendReply { my $reply = shift; print STDERR "# $reply\n"; } sub all { $_ || return 0 for @_; 1 } # Initialisation. my $loadIncreased = 0; my ($localSystem, $maxSilentTime, $printBuildTrace, $buildTimeout) = @ARGV; my $currentLoad = $ENV{"NIX_CURRENT_LOAD"}; my $conf = $ENV{"NIX_REMOTE_SYSTEMS"}; sub openSlotLock { my ($machine, $slot) = @_; my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot"; my $slotLock = new IO::Handle; sysopen $slotLock, "$slotLockFn", O_RDWR|O_CREAT, 0600 or die; return $slotLock; } # Read the list of machines. my @machines; if (defined $conf && -e $conf) { open CONF, "<$conf" or die; while (<CONF>) { chomp; s/\#.*$//g; next if /^\s*$/; my @tokens = split /\s/, $_; my @supportedFeatures = split(/,/, $tokens[5] || ""); my @mandatoryFeatures = split(/,/, $tokens[6] || ""); push @machines, { hostName => $tokens[0] , systemTypes => [ split(/,/, $tokens[1]) ] , sshKeys => $tokens[2] , maxJobs => int($tokens[3]) , speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1) , supportedFeatures => [ @supportedFeatures, @mandatoryFeatures ] , mandatoryFeatures => [ @mandatoryFeatures ] , enabled => 1 }; } close CONF; } # Wait for the calling process to ask us whether we can build some derivation. my ($drvPath, $hostName, $slotLock); REQ: while (1) { $_ = <STDIN> || exit 0; (my $amWilling, my $neededSystem, $drvPath, my $requiredFeatures) = split; my @requiredFeatures = split /,/, $requiredFeatures; my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem); if (!defined $currentLoad) { sendReply "decline"; next; } # Acquire the exclusive lock on $currentLoad/main-lock. mkdir $currentLoad, 0777 or die unless -d $currentLoad; my $mainLock = "$currentLoad/main-lock"; sysopen MAINLOCK, "$mainLock", O_RDWR|O_CREAT, 0600 or die; flock(MAINLOCK, LOCK_EX) or die; while (1) { # Find all machine that can execute this build, i.e., that # support builds for the given platform and features, and are # not at their job limit. my $rightType = 0; my @available = (); LOOP: foreach my $cur (@machines) { if ($cur->{enabled} && (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) && all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{supportedFeatures}} } (@requiredFeatures, @mandatoryFeatures)) && all(map { my $f = $_; 0 != grep { $f eq $_ } @requiredFeatures } @{$cur->{mandatoryFeatures}}) ) { $rightType = 1; # We have a machine of the right type. Determine the load on # the machine. my $slot = 0; my $load = 0; my $free; while ($slot < $cur->{maxJobs}) { my $slotLock = openSlotLock($cur, $slot); if (flock($slotLock, LOCK_EX | LOCK_NB)) { $free = $slot unless defined $free; flock($slotLock, LOCK_UN) or die; } else { $load++; } close $slotLock; $slot++; } push @available, { machine => $cur, load => $load, free => $free } if $load < $cur->{maxJobs}; } } if (defined $ENV{NIX_DEBUG_HOOK}) { print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" foreach @available; } # Didn't find any available machine? Then decline or postpone. if (scalar @available == 0) { # Postpone if we have a machine of the right type, except # if the local system can and wants to do the build. if ($rightType && !$canBuildLocally) { sendReply "postpone"; } else { sendReply "decline"; } close MAINLOCK; next REQ; } # Prioritise the available machines as follows: # - First by load divided by speed factor, rounded to the nearest # integer. This causes fast machines to be preferred over slow # machines with similar loads. # - Then by speed factor. # - Finally by load. sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } @available = sort { lf($a) <=> lf($b) || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} || $a->{load} <=> $b->{load} } @available; # Select the best available machine and lock a free slot. my $selected = $available[0]; my $machine = $selected->{machine}; $slotLock = openSlotLock($machine, $selected->{free}); flock($slotLock, LOCK_EX | LOCK_NB) or die; utime undef, undef, $slotLock; close MAINLOCK; # Connect to the selected machine. @sshOpts = ("-i", $machine->{sshKeys}, "-x"); $hostName = $machine->{hostName}; last REQ if openSSHConnection $hostName; warn "unable to open SSH connection to $hostName, trying other available machines...\n"; $machine->{enabled} = 0; } } # Tell Nix we've accepted the build. sendReply "accept"; my @inputs = split /\s/, readline(STDIN); my @outputs = split /\s/, readline(STDIN); print STDERR "building `$drvPath' on `$hostName'\n"; print STDERR "@ build-remote $drvPath $hostName\n" if $printBuildTrace; my $maybeSign = ""; $maybeSign = "--sign" if -e "$Nix::Config::confDir/signing-key.sec"; # Register the derivation as a temporary GC root. Note that $PPID is # the PID of the remote SSH process, which, due to the use of a # persistant SSH connection, should be the same across all remote # command invocations for this session. my $rootsDir = "@localstatedir@/nix/gcroots/tmp"; system("ssh $hostName @sshOpts 'mkdir -m 1777 -p $rootsDir; ln -sfn $drvPath $rootsDir/\$PPID.drv'"); sub removeRoots { system("ssh $hostName @sshOpts 'rm -f $rootsDir/\$PPID.drv $rootsDir/\$PPID.out'"); } # Copy the derivation and its dependencies to the build machine. This # is guarded by an exclusive lock per machine to prevent multiple # build-remote instances from copying to a machine simultaneously. # That's undesirable because we may end up with N instances uploading # the same missing path simultaneously, causing the effective network # bandwidth and target disk speed to be divided by N. my $uploadLock = "$currentLoad/$hostName.upload-lock"; sysopen UPLOADLOCK, "$uploadLock", O_RDWR|O_CREAT, 0600 or die; eval { local $SIG{ALRM} = sub { die "alarm\n" }; # Don't wait forever, so that a process that gets stuck while # holding the lock doesn't block everybody else indefinitely. # It's safe to continue after a timeout, just (potentially) # inefficient. alarm 15 * 60; flock(UPLOADLOCK, LOCK_EX); alarm 0; }; if ($@) { die unless $@ eq "alarm\n"; print STDERR "somebody is hogging $uploadLock, continuing...\n"; unlink $uploadLock; } Nix::CopyClosure::copyTo($hostName, [ @sshOpts ], [ $drvPath, @inputs ], "", "", 0, 0, $maybeSign ne "", ""); close UPLOADLOCK; # Perform the build. my $buildFlags = "--max-silent-time $maxSilentTime --option build-timeout $buildTimeout" . " --fallback --add-root $rootsDir/\$PPID.out --quiet" . " --option build-keep-log false"; # We let the remote side kill its process group when the connection is # closed unexpectedly. This is necessary to ensure that no processes # are left running on the remote system if the local Nix process is # killed. (SSH itself doesn't kill child processes if the connection # is interrupted unless the `-tt' flag is used to force a pseudo-tty, # in which case every child receives SIGHUP; however, `-tt' doesn't # work on some platforms when connection sharing is used.) pipe STDIN, DUMMY; # make sure we have a readable STDIN if (system("exec ssh $hostName @sshOpts '(read; kill -INT -\$\$) <&0 & nix-store -r $drvPath $buildFlags > /dev/null' 2>&4") != 0) { # Note that if we get exit code 100 from `nix-store -r', it # denotes a permanent build failure (as opposed to an SSH problem # or a temporary Nix problem). We propagate this to the caller to # allow it to distinguish between transient and permanent # failures. my $res = $? >> 8; print STDERR "build of `$drvPath' on `$hostName' failed with exit code $res\n"; removeRoots; exit $res; } #print "build of `$drvPath' on `$hostName' succeeded\n"; # Copy the output from the build machine. foreach my $output (@outputs) { my $maybeSignRemote = ""; $maybeSignRemote = "--sign" if $UID != 0; next if isValidPath($output); system("exec ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output'" . "| NIX_HELD_LOCKS=$output @bindir@/nix-store --import > /dev/null") == 0 or die "cannot copy $output from $hostName: $?"; } # Get rid of the temporary GC roots. removeRoots;