#! @perl@ -w -I@libexecdir@/nix

use strict;
use File::Temp qw(tempdir);
use readmanifest;


# Some hard-coded options.
my $maxNarSize = 100 * 1024 * 1024; # max size of NAR archives to generate patches for
my $maxPatchFraction = 0.60; # if patch is bigger than this fraction of full archive, reject


die unless scalar @ARGV == 5;

my $hashAlgo = "sha256";

my $cacheDir = $ARGV[0];
my $patchesDir = $ARGV[1];
my $patchesURL = $ARGV[2];
my $srcDir = $ARGV[3];
my $dstDir = $ARGV[4];

my $tmpDir = tempdir("nix-generate-patches.XXXXXX", CLEANUP => 1, TMPDIR => 1)
    or die "cannot create a temporary directory";

print "TEMP = $tmpDir\n";

#END { rmdir $tmpDir; }

my %srcNarFiles;
my %srcPatches;
my %srcSuccessors;

my %dstNarFiles;
my %dstPatches;
my %dstSuccessors;

readManifest "$srcDir/MANIFEST",
    \%srcNarFiles, \%srcPatches, \%srcSuccessors;

readManifest "$dstDir/MANIFEST",
    \%dstNarFiles, \%dstPatches, \%dstSuccessors;


sub findOutputPaths {
    my $narFiles = shift;
    my $successors = shift;

    my %outPaths;
    
    foreach my $p (keys %{$narFiles}) {

        # Ignore store expressions.
        next if ($p =~ /\.store$/);
        next if ($p =~ /\.drv$/);
        
        # Ignore builders (too much ambiguity -- they're all called
        # `builder.sh').
        next if ($p =~ /\.sh$/);
        next if ($p =~ /\.patch$/);
        
        # Don't bother including tar files etc.
        next if ($p =~ /\.tar\.(gz|bz2)$/ || $p =~ /\.zip$/ || $p =~ /\.bin$/);

        $outPaths{$p} = 1;
    }

    return %outPaths;
}

print "finding src output paths...\n";
my %srcOutPaths = findOutputPaths \%srcNarFiles, \%srcSuccessors;

print "finding dst output paths...\n";
my %dstOutPaths = findOutputPaths \%dstNarFiles, \%dstSuccessors;


sub getNameVersion {
    my $p = shift;
    $p =~ /\/[0-9a-z]+((?:-[a-zA-Z][^\/-]*)+)([^\/]*)$/;
    my $name = $1;
    my $version = $2;
    $name =~ s/^-//;
    $version =~ s/^-//;
    return ($name, $version);
}


# A quick hack to get a measure of the `distance' between two
# versions: it's just the position of the first character that differs
# (or 999 if they are the same).
sub versionDiff {
    my $s = shift;
    my $t = shift;
    my $i;
    return 999 if $s eq $t;
    for ($i = 0; $i < length $s; $i++) {
        return $i if $i >= length $t or
            substr($s, $i, 1) ne substr($t, $i, 1);
    }
    return $i;
}


sub getNarBz2 {
    my $narFiles = shift;
    my $storePath = shift;
    
    my $narFileList = $$narFiles{$storePath};
    die "missing store expression $storePath" unless defined $narFileList;

    my $narFile = @{$narFileList}[0];
    die unless defined $narFile;

    $narFile->{url} =~ /\/([^\/]+)$/;
    die unless defined $1;
    return "$cacheDir/$1";
}


sub containsPatch {
    my $patches = shift;
    my $storePath = shift;
    my $basePath = shift;
    my $patchList = $$patches{$storePath};
    return 0 if !defined $patchList;
    my $found = 0;
    foreach my $patch (@{$patchList}) {
        # !!! baseHash might differ
        return 1 if $patch->{basePath} eq $basePath;
    }
    return 0;
}


# Compute the "weighted" number of uses of a path in the build graph.
sub computeUses {
    my $narFiles = shift;
    my $path = shift;

    # Find the deriver of $path.
    return 1 unless defined $$narFiles{$path};
    my $deriver = @{$$narFiles{$path}}[0]->{deriver};
    return 1 unless defined $deriver && $deriver ne "";

#    print "  DERIVER $deriver\n";

    # Optimisation: build the referrers graph from the references
    # graph.
    my %referrers;
    foreach my $q (keys %{$narFiles}) {
        my @refs = split " ", @{$$narFiles{$q}}[0]->{references};
        foreach my $r (@refs) {
            $referrers{$r} = [] unless defined $referrers{$r};
            push @{$referrers{$r}}, $q;
        }
    }

    # Determine the shortest path from $deriver to all other reachable
    # paths in the `referrers' graph.

    my %dist;
    $dist{$deriver} = 0;

    my @queue = ($deriver);
    my $pos = 0;
    
    while ($pos < scalar @queue) {
        my $p = $queue[$pos];
        $pos++;

        foreach my $q (@{$referrers{$p}}) {
            if (!defined $dist{$q}) {
                $dist{$q} = $dist{$p} + 1;
#                print "    $q $dist{$q}\n";
                push @queue, $q;
            }
        }
    }

    my $wuse = 1.0;
    foreach my $user (keys %dist) {
        next if $user eq $deriver;
#        print "    $user $dist{$user}\n";
        $wuse += 1.0 / 2.0**$dist{$user};
    }

#    print "    XXX $path $wuse\n";
    
    return $wuse;
}


# For each output path in the destination, see if we need to / can
# create a patch.

print "creating patches...\n";

foreach my $p (keys %dstOutPaths) {

    # If exactly the same path already exists in the source, skip it.
    next if defined $srcOutPaths{$p};
    
    print "  $p\n";

    # If not, then we should find the paths in the source that are
    # `most' likely to be present on a system that wants to install
    # this path.

    (my $name, my $version) = getNameVersion $p;

    my @closest = ();
    my $closestVersion;
    my $minDist = -1; # actually, larger means closer

    # Find all source paths with the same name.

    foreach my $q (keys %srcOutPaths) {
        (my $name2, my $version2) = getNameVersion $q;
        if ($name eq $name2) {

            # If the sizes differ too much, then skip.  This
            # disambiguates between, e.g., a real component and a
            # wrapper component (cf. Firefox in Nixpkgs).
            my $srcSize = @{$srcNarFiles{$q}}[0]->{size};
            my $dstSize = @{$dstNarFiles{$p}}[0]->{size};
            my $ratio = $srcSize / $dstSize;
            $ratio = 1 / $ratio if $ratio < 1;
#            print "  SIZE $srcSize $dstSize $ratio $q\n";

            if ($ratio >= 3) {
                print "  SKIPPING $q due to size ratio $ratio ($srcSize $dstSize)\n";
                next;
            }

            # If the numbers of weighted uses differ too much, then
            # skip.  This disambiguates between, e.g., the bootstrap
            # GCC and the final GCC in Nixpkgs.
            my $srcUses = computeUses \%srcNarFiles, $q;
            my $dstUses = computeUses \%dstNarFiles, $p;
            $ratio = $srcUses / $dstUses;
            $ratio = 1 / $ratio if $ratio < 1;
            print "  USE $srcUses $dstUses $ratio $q\n";
            
#            if ($ratio >= 2) {
#                print "  SKIPPING $q due to use ratio $ratio ($srcUses $dstUses)\n";
#                next;
#            }

            # If there are multiple matching names, include the ones
            # with the closest version numbers.
            my $dist = versionDiff $version, $version2;
            if ($dist > $minDist) {
                $minDist = $dist;
                @closest = ($q);
                $closestVersion = $version2;
            } elsif ($dist == $minDist) {
                push @closest, $q;
            }
        }
    }

    if (scalar(@closest) == 0) {
        print "  NO BASE: $p\n";
        next;
    }

    foreach my $closest (@closest) {

        # Generate a patch between $closest and $p.
        print "  $p <- $closest\n";

        # If the patch already exists, skip it.
        if (containsPatch(\%srcPatches, $p, $closest) ||
            containsPatch(\%dstPatches, $p, $closest))
        {
            print "    skipping, already exists\n";
            next;
        }

#        next;
        
        my $srcNarBz2 = getNarBz2 \%srcNarFiles, $closest;
        my $dstNarBz2 = getNarBz2 \%dstNarFiles, $p;

        system("@bunzip2@ < $srcNarBz2 > $tmpDir/A") == 0
            or die "cannot unpack $srcNarBz2";

        if ((stat "$tmpDir/A")[7] >= $maxNarSize) {
            print "    skipping, source is too large\n";
            next;
        }
        
        system("@bunzip2@ < $dstNarBz2 > $tmpDir/B") == 0
            or die "cannot unpack $dstNarBz2";

        if ((stat "$tmpDir/B")[7] >= $maxNarSize) {
            print "    skipping, destination is too large\n";
            next;
        }
        
        system("@libexecdir@/bsdiff $tmpDir/A $tmpDir/B $tmpDir/DIFF") == 0
            or die "cannot compute binary diff";

        my $baseHash = `@bindir@/nix-hash --flat --type $hashAlgo --base32 $tmpDir/A` or die;
        chomp $baseHash;

        my $narHash = `@bindir@/nix-hash --flat --type $hashAlgo --base32 $tmpDir/B` or die;
        chomp $narHash;

        my $narDiffHash = `@bindir@/nix-hash --flat --type $hashAlgo --base32 $tmpDir/DIFF` or die;
        chomp $narDiffHash;

        my $narDiffSize = (stat "$tmpDir/DIFF")[7];
        my $dstNarBz2Size = (stat $dstNarBz2)[7];

        print "    size $narDiffSize; full size $dstNarBz2Size\n";
        
        if ($narDiffSize >= $dstNarBz2Size) {
            print "    rejecting; patch bigger than full archive\n";
            next;
        }
    
        if ($narDiffSize / $dstNarBz2Size >= $maxPatchFraction) {
            print "    rejecting; patch too large relative to full archive\n";
            next;
        }
    
        my $finalName =
            "$narDiffHash.nar-bsdiff";

        if (-e "$patchesDir/$finalName") {
            print "    not copying, already exists\n";
        }

        else {

            system("cp '$tmpDir/DIFF' '$patchesDir/$finalName.tmp'") == 0
                or die "cannot copy diff";
            
            rename("$patchesDir/$finalName.tmp", "$patchesDir/$finalName")
                or die "cannot rename $patchesDir/$finalName.tmp";
            
        }
        
        # Add the patch to the manifest.
        addPatch \%dstPatches, $p,
            { url => "$patchesURL/$finalName", hash => "$hashAlgo:$narDiffHash"
            , size => $narDiffSize, basePath => $closest, baseHash => "$hashAlgo:$baseHash"
            , narHash => "$hashAlgo:$narHash", patchType => "nar-bsdiff"
            }, 0;
    }
}


# Add in any potentially useful patches in the source (namely, those
# patches that produce either paths in the destination or paths that
# can be used as the base for other useful patches).

print "propagating patches...\n";

my $changed;
do {
    # !!! we repeat this to reach the transitive closure; inefficient
    $changed = 0;

    print "loop\n";

    my %dstBasePaths;
    foreach my $q (keys %dstPatches) {
	foreach my $patch (@{$dstPatches{$q}}) {
	    $dstBasePaths{$patch->{basePath}} = 1;
	}
    }

    foreach my $p (keys %srcPatches) {
        my $patchList = $srcPatches{$p};

        my $include = 0;

        # Is path $p included in the destination?  If so, include
        # patches that produce it.
        $include = 1 if defined $dstNarFiles{$p};

        # Is path $p a path that serves as a base for paths in the
        # destination?  If so, include patches that produce it.
	# !!! check baseHash
        $include = 1 if defined $dstBasePaths{$p};

        if ($include) {
            foreach my $patch (@{$patchList}) {
                $changed = 1 if addPatch \%dstPatches, $p, $patch;
            }
        }
        
    }
    
} while $changed;


# Rewrite the manifest of the destination (with the new patches).
writeManifest "$dstDir/MANIFEST",
    \%dstNarFiles, \%dstPatches, \%dstSuccessors;