about summary refs log tree commit diff
diff options
context:
space:
mode:
authorEelco Dolstra <e.dolstra@tudelft.nl>2007-10-09T22·14+0000
committerEelco Dolstra <e.dolstra@tudelft.nl>2007-10-09T22·14+0000
commita8629de827e4d5a67372614727ce6fcc26423f8c (patch)
treebf1cffcf63a74e41ec48fb7e12918d57979fc763
parent27a0662828cb5ac9da198f35754750f12628d546 (diff)
* New command `nix-store --optimise' to reduce Nix store disk space
  usage by finding identical files in the store and hard-linking them
  to each other.  It typically reduces the size of the store by
  something like 25-35%.  This is what the optimise-store.pl script
  did, but the new command is faster and more correct (it's safe wrt
  garbage collection and concurrent builds).

-rwxr-xr-xscripts/optimise-store.pl91
-rw-r--r--src/libstore/local-store.cc125
-rw-r--r--src/libstore/local-store.hh18
-rw-r--r--src/nix-store/help.txt1
-rw-r--r--src/nix-store/nix-store.cc52
5 files changed, 188 insertions, 99 deletions
diff --git a/scripts/optimise-store.pl b/scripts/optimise-store.pl
deleted file mode 100755
index 41557e6d18..0000000000
--- a/scripts/optimise-store.pl
+++ /dev/null
@@ -1,91 +0,0 @@
-#! /usr/bin/perl -w
-
-use strict;
-use File::Basename;
-
-
-my @paths = ("/nix/store");
-
-
-print "hashing...\n";
-
-my $hashList = "/tmp/nix-optimise-hash-list";
-
-system("find @paths -type f -print0 | xargs -0 md5sum -- > $hashList") == 0
-    or die "cannot hash store files";
-
-
-print "sorting by hash...\n";
-
-system("sort $hashList > $hashList.sorted") == 0
-    or die "cannot sort list";
-
-
-sub atomicLink {
-    my $target = shift;
-    my $new = shift;
-    my $tmpNew = "${new}_optimise.$$";
-
-    # Make the directory writable temporarily.
-    my $dir = dirname $new;
-    my @st = stat $dir or die;
-
-    chmod ($st[2] | 0200, $dir) or die "cannot make `$dir' writable: $!";
-    
-    link $target, $tmpNew or die "cannot create hard link `$tmpNew': $!";
-
-    rename $tmpNew, $new or die "cannot rename `$tmpNew' to `$new': $!";
-
-    chmod ($st[2], $dir) or die "cannot restore permission on `$dir': $!";
-    utime ($st[8], $st[9], $dir) or die "cannot restore timestamp on `$dir': $!";
-}
-
-
-print "hard-linking...\n";
-
-open LIST, "<$hashList.sorted" or die;
-
-my $prevFile;
-my $prevHash;
-my $prevInode;
-my $prevExec;
-
-my $totalSpace = 0;
-my $savedSpace = 0;
-
-while (<LIST>) {
-    /^([0-9a-f]*)\s+(.*)$/ or die;
-    my $curFile = $2;
-    my $curHash = $1;
-
-    my @st = stat $curFile or die;
-    next if ($st[2] & 0222) != 0; # skip writable files
-
-    my $fileSize = $st[7];
-    $totalSpace += $fileSize;
-    my $isExec = ($st[2] & 0111) == 0111;
-
-    if (defined $prevHash && $curHash eq $prevHash
-        && $prevExec == $isExec)
-    {
-        
-        if ($st[1] != $prevInode) {
-            print "$curFile = $prevFile\n";
-            atomicLink $prevFile, $curFile;
-            $savedSpace += $fileSize;
-        }
-        
-    } else {
-        $prevFile = $curFile;
-        $prevHash = $curHash;
-        $prevInode = $st[1];
-        $prevExec = ($st[2] & 0111) == 0111;
-    }
-}
-
-print "total space = $totalSpace\n";
-print "saved space = $savedSpace\n";
-my $savings = ($savedSpace / $totalSpace) * 100.0;
-print "savings = $savings %\n";
-
-close LIST;
diff --git a/src/libstore/local-store.cc b/src/libstore/local-store.cc
index 4378f0ba61..c77ab3c6ce 100644
--- a/src/libstore/local-store.cc
+++ b/src/libstore/local-store.cc
@@ -174,7 +174,7 @@ void copyPath(const Path & src, const Path & dst, PathFilter & filter)
 }
 
 
-static void _canonicalisePathMetaData(const Path & path)
+static void _canonicalisePathMetaData(const Path & path, bool recurse)
 {
     checkInterrupt();
 
@@ -223,17 +223,17 @@ static void _canonicalisePathMetaData(const Path & path)
 
     }
 
-    if (S_ISDIR(st.st_mode)) {
+    if (recurse && S_ISDIR(st.st_mode)) {
         Strings names = readDirectory(path);
 	for (Strings::iterator i = names.begin(); i != names.end(); ++i)
-	    _canonicalisePathMetaData(path + "/" + *i);
+	    _canonicalisePathMetaData(path + "/" + *i, true);
     }
 }
 
 
 void canonicalisePathMetaData(const Path & path)
 {
-    _canonicalisePathMetaData(path);
+    _canonicalisePathMetaData(path, true);
 
     /* On platforms that don't have lchown(), the top-level path can't
        be a symlink, since we can't change its ownership. */
@@ -625,7 +625,7 @@ void LocalStore::exportPath(const Path & path, bool sign,
        consistent metadata. */
     Transaction txn(nixDB);
     addTempRoot(path);
-    if (!isValidPath(path))
+    if (!isValidPathTxn(txn, path))
         throw Error(format("path `%1%' is not valid") % path);
 
     HashAndWriteSink hashAndWriteSink(sink);
@@ -950,6 +950,121 @@ void verifyStore(bool checkContents)
 }
 
 
+typedef std::map<Hash, std::pair<Path, ino_t> > HashToPath; /* content hash -> (first path seen with it, that path's inode) */
+
+
+/* Make `path' owner-writable, or strip every write bit from it. */
+static void toggleWritable(const Path & path, bool writable)
+{
+    struct stat info;
+    if (lstat(path.c_str(), &info) != 0)
+        throw SysError(format("getting attributes of path `%1%'") % path);
+
+    const mode_t writeBits = S_IWUSR | S_IWGRP | S_IWOTH;
+    const mode_t wanted = writable ? (info.st_mode | S_IWUSR) : (info.st_mode & ~writeBits);
+
+    if (chmod(path.c_str(), wanted) == -1)
+        throw SysError(format("changing writability of `%1%'") % path);
+}
+
+
+/* Hard-link `path' to an earlier file with the same hash, recursing into
+   directories.  `hashToPath' maps each hash to the first path (and inode)
+   seen; `stats' is updated in place.  Throws SysError if a system call
+   fails.  NOTE(review): `dryRun' is only forwarded, never consulted. */
+static void hashAndLink(bool dryRun, HashToPath & hashToPath,
+    OptimiseStats & stats, const Path & path)
+{
+    struct stat st;
+    if (lstat(path.c_str(), &st))
+        throw SysError(format("getting attributes of path `%1%'") % path);
+
+    /* Sometimes SNAFUs can cause files in the Nix store to be
+       modified, in particular when running programs as root under
+       NixOS (example: $fontconfig/var/cache being modified).  Skip
+       those files. */
+    if (S_ISREG(st.st_mode) && (st.st_mode & S_IWUSR)) {
+        printMsg(lvlError, format("skipping suspicious writable file `%1%'") % path);
+        return;
+    }
+
+    /* We can hard link regular files and symlinks. */
+    if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) {
+        /* Hash the file.  hashPath() hashes the NAR serialisation, which
+           includes the execute bit, so executable and non-executable
+           files with the same contents *won't* be linked (good, since
+           otherwise the permissions would be screwed up).  For a symlink
+           we hash the link itself (the readlink() result), not its
+           target (which may not even exist). */
+        Hash hash = hashPath(htSHA256, path);
+        stats.totalFiles++;
+        printMsg(lvlDebug, format("`%1%' has hash `%2%'") % path % printHash(hash));
+
+        HashToPath::iterator seen = hashToPath.find(hash);
+        if (seen == hashToPath.end()) {
+            hashToPath.insert(HashToPath::value_type(hash, std::pair<Path, ino_t>(path, st.st_ino)));
+            return;
+        }
+        const Path & prevFile = seen->second.first;
+        /* We've seen a file with the same contents: hard-link to it. */
+        stats.sameContents++;
+        if (seen->second.second == st.st_ino) {
+            printMsg(lvlDebug, format("`%1%' is already linked to `%2%'") % path % prevFile);
+            return;
+        }
+        printMsg(lvlTalkative, format("linking `%1%' to `%2%'") % path % prevFile);
+
+        Path tempLink = (format("%1%.tmp-%2%-%3%")
+            % path % getpid() % rand()).str();
+        Path dir = dirOf(path);
+        toggleWritable(dir, true);
+        try {
+            if (link(prevFile.c_str(), tempLink.c_str()) == -1)
+                throw SysError(format("cannot link `%1%' to `%2%'") % tempLink % prevFile);
+            /* Atomically replace the old file with the new hard link. */
+            if (rename(tempLink.c_str(), path.c_str()) == -1) {
+                /* Build the error first; SysError presumably reads errno, which unlink() may change. */
+                SysError e(format("cannot rename `%1%' to `%2%'") % tempLink % path);
+                unlink(tempLink.c_str()); /* don't leave the temp link behind */
+                throw e;
+            }
+        } catch (...) {
+            /* Don't leave the directory writable; keep the original error. */
+            try { _canonicalisePathMetaData(dir, false); } catch (...) { }
+            throw;
+        }
+
+        /* Make the directory read-only again and reset its timestamp. */
+        _canonicalisePathMetaData(dir, false);
+        stats.filesLinked++;
+        stats.bytesFreed += st.st_size;
+    }
+
+    if (S_ISDIR(st.st_mode)) {
+        Strings names = readDirectory(path);
+        for (Strings::iterator i = names.begin(); i != names.end(); ++i)
+            hashAndLink(dryRun, hashToPath, stats, path + "/" + *i);
+    }
+}
+
+
+/* Run hashAndLink() over every path listed in the dbValidPaths table that
+   is still valid, accumulating counters in `stats'.  `dryRun' is passed on. */
+void LocalStore::optimiseStore(bool dryRun, OptimiseStats & stats)
+{
+    HashToPath hashToPath;
+    Paths paths;
+    nixDB.enumTable(noTxn, dbValidPaths, paths);
+
+    for (Paths::iterator i = paths.begin(); i != paths.end(); ++i) {
+        addTempRoot(*i);
+        if (!isValidPath(*i)) continue; /* path was GC'ed, probably */
+        startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i);
+        hashAndLink(dryRun, hashToPath, stats, *i);
+    }
+}
+
+
 /* Upgrade from schema 1 (Nix <= 0.7) to schema 2 (Nix >= 0.8). */
 static void upgradeStore07()
 {
diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh
index 8bd37bc0a3..6c366167f1 100644
--- a/src/libstore/local-store.hh
+++ b/src/libstore/local-store.hh
@@ -21,6 +21,20 @@ const int nixSchemaVersion = 4;
 extern string drvsLogDir;
 
 
+/* Counters filled in by LocalStore::optimiseStore(); all start at zero. */
+struct OptimiseStats
+{
+    unsigned long totalFiles;      /* files hashed */
+    unsigned long sameContents;    /* files whose hash had been seen before */
+    unsigned long filesLinked;     /* files replaced by a hard link */
+    unsigned long long bytesFreed; /* sum of st_size of the replaced files */
+    OptimiseStats()
+        : totalFiles(0), sameContents(0), filesLinked(0), bytesFreed(0)
+    {
+    }
+};
+
+
 class LocalStore : public StoreAPI
 {
 private:
@@ -83,6 +97,10 @@ public:
 
     void collectGarbage(GCAction action, const PathSet & pathsToDelete,
         bool ignoreLiveness, PathSet & result, unsigned long long & bytesFreed);
+
+    /* Optimise the disk space usage of the Nix store by hard-linking
+       files with the same contents.  Counters are reported via `stats'. */
+    void optimiseStore(bool dryRun, OptimiseStats & stats);
 };
 
 
diff --git a/src/nix-store/help.txt b/src/nix-store/help.txt
index 0662f67962..14b83a06ca 100644
--- a/src/nix-store/help.txt
+++ b/src/nix-store/help.txt
@@ -21,6 +21,7 @@ Operations:
 
   --init: initialise the Nix database
   --verify: verify Nix structures
+  --optimise: optimise the Nix store by hard-linking identical files
 
   --version: output version information
   --help: display help
diff --git a/src/nix-store/nix-store.cc b/src/nix-store/nix-store.cc
index 176dc39f9b..678ce2ae9c 100644
--- a/src/nix-store/nix-store.cc
+++ b/src/nix-store/nix-store.cc
@@ -466,6 +466,13 @@ static void opCheckValidity(Strings opFlags, Strings opArgs)
 }
 
 
+static string showBytes(unsigned long long bytes)
+{
+    double mib = bytes / (1024.0 * 1024.0); /* 1 MiB = 2^20 bytes; printed with 2 decimals */
+    return (format("%d bytes (%.2f MiB)") % bytes % mib).str();
+}
+
+
 struct PrintFreed 
 {
     bool show, dryRun;
@@ -477,9 +484,9 @@ struct PrintFreed
         if (show)
             cout << format(
                 (dryRun
-                    ? "%d bytes would be freed (%.2f MiB)\n"
-                    : "%d bytes freed (%.2f MiB)\n"))
-                % bytesFreed % (bytesFreed / (1024.0 * 1024.0));
+                    ? "%1% would be freed\n"
+                    : "%1% freed (%.2f MiB)\n"))
+                % showBytes(bytesFreed);
     }
 };
 
@@ -614,6 +621,43 @@ static void opVerify(Strings opFlags, Strings opArgs)
 }
 
 
+
+/* Report the counters in `stats' as one message at level lvlError. */
+static void showOptimiseStats(OptimiseStats & stats)
+{
+    string freed = showBytes(stats.bytesFreed);
+    printMsg(lvlError,
+        format("%1% freed by hard-linking %2% files; there are %3% files with equal contents out of %4% files in total")
+        % freed % stats.filesLinked
+        % stats.sameContents % stats.totalFiles);
+}
+
+
+/* Optimise the disk space usage of the Nix store by hard-linking
+   files with the same contents.  Accepts no flags and no arguments;
+   the statistics are shown even when optimisation throws. */
+static void opOptimise(Strings opFlags, Strings opArgs)
+{
+    if (!opArgs.empty())
+        throw UsageError("no arguments expected");
+
+    if (!opFlags.empty())
+        throw UsageError(format("unknown flag `%1%'") % opFlags.front());
+
+    LocalStore * store2(dynamic_cast<LocalStore *>(store.get()));
+    if (!store2) throw Error("you don't have sufficient rights to use --optimise");
+
+    OptimiseStats stats;
+    try {
+        store2->optimiseStore(false, stats); /* this operation is not a dry run */
+    } catch (...) {
+        showOptimiseStats(stats);
+        throw;
+    }
+    showOptimiseStats(stats);
+}
+
+
 /* Scan the arguments; find the operation, set global flags, put all
    other flags in a list, and put all other arguments in another
    list. */
@@ -659,6 +703,8 @@ void run(Strings args)
             op = opInit;
         else if (arg == "--verify")
             op = opVerify;
+        else if (arg == "--optimise")
+            op = opOptimise;
         else if (arg == "--add-root") {
             if (i == args.end())
                 throw UsageError("`--add-root requires an argument");