about summary refs log blame commit diff
path: root/scripts/build-remote.pl.in
blob: 73d7abe37d324750400dff894a3e716fc7439296 (plain) (tree)
1
2
3
4
5
6
7
8
9
                        
 
                              
                             
               
                
             
                     
               
                    
 









                                                                      
                                






                                                                     







                                                                     

                      
                              

 

                                    
 


                      
                                                                           
 
                                           
                                      
 




                                                                                                                      
                                                                  

                     

 
                            
             
                                
                               



                        
                                    

                                                             
                       




                                                                             

                                                                             



                          

 

 

                                                                              
                
 

                           
                                                                              
                                                        
 
                                                                        
 



                                
 


                                                           
                                                               
                                    

 

                                                                  

                                                                     


                                           

                                                                        


                                                                                                                                       
             
















                                                                             
                 
 

                                                                                  
             
         
 



                                                                                           

 






                                                                       
                                    


                           
         
 
 











                                                                                                 

 
                                                                 
                                     
                                           
 


                                                              
 
                       
 
 
                                          
                                                  
                                         
              
                                                                            






                                                                                                  

                                
 


                                    


                                          
 
 


                                                                       
                   
                                                                    
 
 






                                                                      















                                                                    
                                                                                                               
                 
 
 
                    
                                                    





                                           




                                                                      

                                                  

              
 

                                         

                                                  

                                         


                                                       
 
#! @perl@ -w @perlFlags@

use Fcntl qw(:DEFAULT :flock);
use English '-no_match_vars';
use IO::Handle;
use Nix::Config;
use Nix::SSH;
use Nix::CopyClosure;
use Nix::Store;
no warnings('once');


# General operation:
#
# Try to find a free machine of type $neededSystem.  We do this as
# follows:
# - We acquire an exclusive lock on $currentLoad/main-lock.
# - For each machine $machine of type $neededSystem and for each $slot
#   less than the maximum load for that machine, we try to get an
#   exclusive lock on $currentLoad/$machine-$slot (without blocking).
#   If we get such a lock, we send "accept" to the caller.  Otherwise,
#   we send "postpone" and exit.
# - We release the exclusive lock on $currentLoad/main-lock.
# - We perform the build on $neededSystem.
# - We release the exclusive lock on $currentLoad/$machine-$slot.
#
# The nice thing about this scheme is that if we die prematurely, the
# locks are released automatically.


# Make sure that we don't get any SSH passphrase or host key popups -
# if there is any problem it should fail, not do something
# interactive.
$ENV{"DISPLAY"} = "";
$ENV{"SSH_ASKPASS"} = "";


sub sendReply {
    my $reply = shift;
    print STDERR "# $reply\n";
}

sub all { $_ || return 0 for @_; 1 }


# Initialisation.
my $loadIncreased = 0;

my ($localSystem, $maxSilentTime, $printBuildTrace, $buildTimeout) = @ARGV;

my $currentLoad = $ENV{"NIX_CURRENT_LOAD"};
my $conf = $ENV{"NIX_REMOTE_SYSTEMS"};


sub openSlotLock {
    my ($machine, $slot) = @_;
    my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot";
    my $slotLock = new IO::Handle;
    sysopen $slotLock, "$slotLockFn", O_RDWR|O_CREAT, 0600 or die;
    return $slotLock;
}


# Read the list of machines.
my @machines;
if (defined $conf && -e $conf) {
    open CONF, "<$conf" or die;
    while (<CONF>) {
        chomp;
        s/\#.*$//g;
        next if /^\s*$/;
        my @tokens = split /\s/, $_;
        my @supportedFeatures = split(/,/, $tokens[5] || "");
        my @mandatoryFeatures = split(/,/, $tokens[6] || "");
        push @machines,
            { hostName => $tokens[0]
            , systemTypes => [ split(/,/, $tokens[1]) ]
            , sshKeys => $tokens[2]
            , maxJobs => int($tokens[3])
            , speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1)
            , supportedFeatures => [ @supportedFeatures, @mandatoryFeatures ]
            , mandatoryFeatures => [ @mandatoryFeatures ]
            , enabled => 1
            };
    }
    close CONF;
}



# Wait for the calling process to ask us whether we can build some derivation.
my ($drvPath, $hostName, $slotLock);
my ($from, $to);

REQ: while (1) {
    $_ = <STDIN> || exit 0;
    (my $amWilling, my $neededSystem, $drvPath, my $requiredFeatures) = split;
    my @requiredFeatures = split /,/, $requiredFeatures;

    my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem);

    if (!defined $currentLoad) {
        sendReply "decline";
        next;
    }

    # Acquire the exclusive lock on $currentLoad/main-lock.
    mkdir $currentLoad, 0777 or die unless -d $currentLoad;
    my $mainLock = "$currentLoad/main-lock";
    sysopen MAINLOCK, "$mainLock", O_RDWR|O_CREAT, 0600 or die;
    flock(MAINLOCK, LOCK_EX) or die;


    while (1) {
        # Find all machine that can execute this build, i.e., that
        # support builds for the given platform and features, and are
        # not at their job limit.
        my $rightType = 0;
        my @available = ();
        LOOP: foreach my $cur (@machines) {
            if ($cur->{enabled}
                && (grep { $neededSystem eq $_ } @{$cur->{systemTypes}})
                && all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{supportedFeatures}} } (@requiredFeatures, @mandatoryFeatures))
                && all(map { my $f = $_; 0 != grep { $f eq $_ } @requiredFeatures } @{$cur->{mandatoryFeatures}})
                )
            {
                $rightType = 1;

                # We have a machine of the right type.  Determine the load on
                # the machine.
                my $slot = 0;
                my $load = 0;
                my $free;
                while ($slot < $cur->{maxJobs}) {
                    my $slotLock = openSlotLock($cur, $slot);
                    if (flock($slotLock, LOCK_EX | LOCK_NB)) {
                        $free = $slot unless defined $free;
                        flock($slotLock, LOCK_UN) or die;
                    } else {
                        $load++;
                    }
                    close $slotLock;
                    $slot++;
                }

                push @available, { machine => $cur, load => $load, free => $free }
                if $load < $cur->{maxJobs};
            }
        }

        if (defined $ENV{NIX_DEBUG_HOOK}) {
            print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
                foreach @available;
        }


        # Didn't find any available machine?  Then decline or postpone.
        if (scalar @available == 0) {
            # Postpone if we have a machine of the right type, except
            # if the local system can and wants to do the build.
            if ($rightType && !$canBuildLocally) {
                sendReply "postpone";
            } else {
                sendReply "decline";
            }
            close MAINLOCK;
            next REQ;
        }


        # Prioritise the available machines as follows:
        # - First by load divided by speed factor, rounded to the nearest
        #   integer.  This causes fast machines to be preferred over slow
        #   machines with similar loads.
        # - Then by speed factor.
        # - Finally by load.
        sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
        @available = sort
            { lf($a) <=> lf($b)
                  || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
                  || $a->{load} <=> $b->{load}
            } @available;


        # Select the best available machine and lock a free slot.
        my $selected = $available[0];
        my $machine = $selected->{machine};

        $slotLock = openSlotLock($machine, $selected->{free});
        flock($slotLock, LOCK_EX | LOCK_NB) or die;
        utime undef, undef, $slotLock;

        close MAINLOCK;


        # Connect to the selected machine.
        my @sshOpts = ("-i", $machine->{sshKeys});
        $hostName = $machine->{hostName};
        eval {
            ($from, $to) = connectToRemoteNix($hostName, \@sshOpts, "2>&4");
            # FIXME: check if builds are inhibited.
        };
        last REQ unless $@;
        print STDERR "$@";
        warn "unable to open SSH connection to `$hostName', trying other available machines...\n";
        $from = undef;
        $to = undef;
        $machine->{enabled} = 0;
    }
}


# Tell Nix we've accepted the build.
sendReply "accept";
my @inputs = split /\s/, readline(STDIN);
my @outputs = split /\s/, readline(STDIN);


print STDERR "@ build-remote $drvPath $hostName\n" if $printBuildTrace;


my $maybeSign = "";
$maybeSign = "--sign" if -e "$Nix::Config::confDir/signing-key.sec";


# Copy the derivation and its dependencies to the build machine.  This
# is guarded by an exclusive lock per machine to prevent multiple
# build-remote instances from copying to a machine simultaneously.
# That's undesirable because we may end up with N instances uploading
# the same missing path simultaneously, causing the effective network
# bandwidth and target disk speed to be divided by N.
my $uploadLock = "$currentLoad/$hostName.upload-lock";
sysopen UPLOADLOCK, "$uploadLock", O_RDWR|O_CREAT, 0600 or die;
eval {
    local $SIG{ALRM} = sub { die "alarm\n" };
    # Don't wait forever, so that a process that gets stuck while
    # holding the lock doesn't block everybody else indefinitely.
    # It's safe to continue after a timeout, just (potentially)
    # inefficient.
    alarm 15 * 60;
    flock(UPLOADLOCK, LOCK_EX);
    alarm 0;
};
if ($@) {
    die unless $@ eq "alarm\n";
    print STDERR "somebody is hogging $uploadLock, continuing...\n";
    unlink $uploadLock;
}
Nix::CopyClosure::copyToOpen($from, $to, $hostName, [ $drvPath, @inputs ], "", "", 0, 0, $maybeSign ne "", "");
close UPLOADLOCK;


# Perform the build.
print STDERR "building `$drvPath' on `$hostName'\n";
writeInt(6, $to) or die; # == cmdBuildPaths
writeStrings([$drvPath], $to);
writeInt($maxSilentTime, $to);
writeInt($buildTimeout, $to);
my $res = readInt($from);
if ($res != 0) {
    # Note that if we get exit code 100 from `nix-store -r', it
    # denotes a permanent build failure (as opposed to an SSH problem
    # or a temporary Nix problem).  We propagate this to the caller to
    # allow it to distinguish between transient and permanent
    # failures.
    my $msg = readString($from);
    print STDERR "error: $msg (on `$hostName')\n";
    exit $res;
}


# Copy the output from the build machine.
my @outputs2 = grep { !isValidPath($_) } @outputs;
if (scalar @outputs2 > 0) {
    writeInt(5, $to); # == cmdExportPaths
    writeInt(0, $to); # don't sign
    writeStrings(\@outputs2, $to);
    $ENV{'NIX_HELD_LOCKS'} = "@outputs2"; # FIXME: ugly
    importPaths(fileno($from));
}