about summary refs log tree commit diff
path: root/scripts/show-duplication.pl
diff options
context:
space:
mode:
author Eelco Dolstra <e.dolstra@tudelft.nl> 2006-09-19T13·53+0000
committer Eelco Dolstra <e.dolstra@tudelft.nl> 2006-09-19T13·53+0000
commit 9488ae7357b718e09362c22f075cc5553c758214 (patch)
tree 765fe2f7552291d213a09f35b7b78ec3fdf1714b /scripts/show-duplication.pl
parent e0afaf18576e8c04110f4ae8499a311cae261215 (diff)
* `show-duplication.pl', a small utility that shows the amount of
  package duplication present in (e.g.) a profile.  It shows the
  number of instances of each package in a closure, along with the
  size in bytes of each instance as well as the "waste" (the
  difference between the sum of the sizes of all instances and the
  average size).

  $ ./show-duplication.pl /nix/var/nix/profiles/default
  gcc 11
    3.3.6 19293318
    3.4.4 21425257
    ...
    average 14942970, waste 149429707
  coreutils 6
  ...
  average package duplication 1.87628865979381, total size 3486330471, total waste 1335324237, 38.3017114443825% wasted

  This utility is useful for measuring the cost in terms of disk space
  of the Nix approach.

Diffstat (limited to 'scripts/show-duplication.pl')
-rwxr-xr-x scripts/show-duplication.pl 73
1 file changed, 73 insertions, 0 deletions
diff --git a/scripts/show-duplication.pl b/scripts/show-duplication.pl
new file mode 100755
index 000000000000..0604c6696c7a
--- /dev/null
+++ b/scripts/show-duplication.pl
@@ -0,0 +1,73 @@
+#! /usr/bin/perl -w
+
+# Exactly one argument is expected: the path whose closure is analysed.
+if (scalar @ARGV != 1) {
+    print STDERR "syntax: show-duplication.pl PATH\n";
+    exit 1;
+}
+my $root = $ARGV[0];
+
+# Within a name every dash must be followed by a non-digit character.
+my $nameRE = "(?:(?:[A-Za-z0-9\+\_]|(?:-[^0-9]))+)";
+my $versionRE = "(?:[A-Za-z0-9\.\-]+)";
+
+
+# Map from package name to a list of {version, path} records, one per store path.
+my %pkgInstances;
+
+# List-form pipe open: no shell is involved and a failed exec is reported here.
+open(my $paths, "-|", "nix-store", "-qR", $root) or die "cannot run nix-store: $!\n";
+while (my $path = <$paths>) {
+    chomp $path;
+    # Expect "<dir>/<hash>-<name>[-<version>]"; skip anything else.
+    my ($nameVersion) = $path =~ /^.*\/[0-9a-z]*-(.*)$/
+        or do { warn "ignoring unexpected line `$path'\n"; next; };
+    my ($name, undef, $version) = $nameVersion =~ /^($nameRE)(-($versionRE))?$/;
+    $name = $nameVersion unless defined $name;    # unparsable: keep the whole string
+    $version = "(unnumbered)" unless defined $version;
+    push @{$pkgInstances{$name}}, {version => $version, path => $path};
+}
+close $paths or exit 1;
+
+
+# Return the lstat size in bytes of $path plus, for directories, of all entries below it.
+sub pathSize {
+    my $path = shift;
+    my @st = lstat $path or die "cannot lstat `$path': $!";
+    my $size = $st[7];
+
+    # `-d _' reuses the lstat result, so symlinks to directories are
+    # counted as links and never followed (avoids double counting and loops).
+    if (-d _) {
+        opendir my $dh, $path or die "cannot open directory `$path': $!";
+        my @names = grep { $_ ne "." && $_ ne ".." } readdir $dh;
+        closedir $dh;
+        $size += pathSize("$path/$_") foreach @names;
+    }
+    return $size;
+}
+
+
+# Report every package, most instances first (ties by name), and add up totals.
+my ($totalPaths, $totalSize, $totalWaste) = (0, 0, 0);
+my @names = sort {scalar @{$pkgInstances{$b}} <=> scalar @{$pkgInstances{$a}} or $a cmp $b} keys %pkgInstances;
+foreach my $name (@names) {
+    my @instances = sort {$a->{version} cmp $b->{version}} @{$pkgInstances{$name}};
+    print "$name ", scalar @instances, "\n";
+    my $allSize = 0;
+    foreach my $x (@instances) {
+        my $size = pathSize($x->{path});
+        $allSize += $size;
+        print "    $x->{version} $size\n";
+    }
+    my $avgSize = int($allSize / scalar @instances);
+    my $waste = $allSize - $avgSize;    # everything beyond one average-sized instance
+    $totalPaths += scalar @instances;
+    $totalSize += $allSize;
+    $totalWaste += $waste;
+    print "    average $avgSize, waste $waste\n";
+}
+# Guard both ratios: an empty closure or a zero total must not divide by zero.
+my $avgDupl = @names ? $totalPaths / scalar @names : 0;
+my $wasteFactor = $totalSize ? ($totalWaste / $totalSize) * 100 : 0;
+print "average package duplication $avgDupl, total size $totalSize, total waste $totalWaste, $wasteFactor% wasted\n";