about summary refs log tree commit diff
path: root/src/libstore/references.cc
diff options
context:
space:
mode:
author Eelco Dolstra <e.dolstra@tudelft.nl> 2009-03-28T20·51+0000
committer Eelco Dolstra <e.dolstra@tudelft.nl> 2009-03-28T20·51+0000
commit 6e946c8e72e0fd2baa58cab484a34debd2ae906d (patch)
tree d088a2a99968dcf29d4dee95d06e229abe8faf3b /src/libstore/references.cc
parent c7152c8f97d01dda4eeb70869a0d28cc9a04df1f (diff)
* Scan for references and compute the SHA-256 hash of the output in
  one pass.  This halves the amount of I/O.

Diffstat (limited to 'src/libstore/references.cc')
-rw-r--r-- src/libstore/references.cc 115
1 files changed, 39 insertions, 76 deletions
diff --git a/src/libstore/references.cc b/src/libstore/references.cc
index bfb4f8d0a3..68d1cbfbfb 100644
--- a/src/libstore/references.cc
+++ b/src/libstore/references.cc
@@ -1,18 +1,10 @@
 #include "references.hh"
 #include "hash.hh"
 #include "util.hh"
+#include "archive.hh"
 
-#include <cerrno>
-#include <cstring>
-#include <cstdlib>
 #include <map>
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <dirent.h>
-#include <fcntl.h>
-
 
 namespace nix {
 
@@ -20,8 +12,8 @@ namespace nix {
 static unsigned int refLength = 32; /* characters */
 
 
-static void search(size_t len, const unsigned char * s,
-    StringSet & ids, StringSet & seen)
+static void search(const unsigned char * s, unsigned int len, 
+    StringSet & hashes, StringSet & seen)
 {
     static bool initialised = false;
     static bool isBase32[256];
@@ -43,93 +35,60 @@ static void search(size_t len, const unsigned char * s,
             }
         if (!match) continue;
         string ref((const char *) s + i, refLength);
-        if (ids.find(ref) != ids.end()) {
+        if (hashes.find(ref) != hashes.end()) {
             debug(format("found reference to `%1%' at offset `%2%'")
                   % ref % i);
             seen.insert(ref);
-            ids.erase(ref);
+            hashes.erase(ref);
         }
         ++i;
     }
 }
 
 
-void checkPath(const string & path,
-    StringSet & ids, StringSet & seen)
+struct RefScanSink : Sink
 {
-    checkInterrupt();
-    
-    debug(format("checking `%1%'") % path);
+    HashSink hashSink;
+    StringSet hashes;
+    StringSet seen;
 
-    struct stat st;
-    if (lstat(path.c_str(), &st))
-        throw SysError(format("getting attributes of path `%1%'") % path);
+    string tail;
 
-    if (S_ISDIR(st.st_mode)) {
-        Strings names = readDirectory(path);
-	for (Strings::iterator i = names.begin(); i != names.end(); i++) {
-            search(i->size(), (const unsigned char *) i->c_str(), ids, seen);
-            checkPath(path + "/" + *i, ids, seen);
-        }
-    }
+    RefScanSink() : hashSink(htSHA256) { }
+    
+    void operator () (const unsigned char * data, unsigned int len);
+};
 
-    else if (S_ISREG(st.st_mode)) {
-        
-        AutoCloseFD fd = open(path.c_str(), O_RDONLY);
-        if (fd == -1) throw SysError(format("opening file `%1%'") % path);
 
-        size_t bufSize = 1024 * 1024;
-        assert(refLength <= bufSize);
-        unsigned char * buf = new unsigned char[bufSize];
+void RefScanSink::operator () (const unsigned char * data, unsigned int len)
+{
+    hashSink(data, len);
 
-        size_t left = st.st_size;
-        bool firstBlock = true;
-        
-        while (left > 0) {
-            checkInterrupt();
-            
-            size_t read = left > bufSize ? bufSize : left;
-            size_t copiedBytes = 0;
-
-            if (!firstBlock) {
-                /* Move the last (refLength - 1) bytes from the last
-                   block to the start of the buffer to deal with
-                   references that cross block boundaries. */
-                copiedBytes = refLength - 1;
-                if (read + copiedBytes > bufSize)
-                    read -= copiedBytes;
-                memcpy(buf, buf + (bufSize - copiedBytes), copiedBytes);
-            }
-            firstBlock = false;
+    /* It's possible that a reference spans the previous and current
+       fragment, so search in the concatenation of the tail of the
+       previous fragment and the start of the current fragment. */
+    string s = tail + string((const char *) data, len > refLength ? refLength : len);
+    search((const unsigned char *) s.c_str(), s.size(), hashes, seen);
 
-            readFull(fd, buf + copiedBytes, read);
-            left -= read;
+    search(data, len, hashes, seen);
 
-            search(copiedBytes + read, buf, ids, seen);
-        }
-        
-        delete[] buf; /* !!! autodelete */
-    }
-    
-    else if (S_ISLNK(st.st_mode)) {
-        string target = readLink(path);
-        search(target.size(), (const unsigned char *) target.c_str(), ids, seen);
-    }
-    
-    else throw Error(format("unknown file type: %1%") % path);
+    unsigned int tailLen = len <= refLength ? len : refLength;
+    tail =
+        string(tail, tail.size() < refLength - tailLen ? 0 : tail.size() - (refLength - tailLen)) +
+        string((const char *) data + len - tailLen, tailLen);
 }
 
 
-PathSet scanForReferences(const string & path, const PathSet & paths)
+PathSet scanForReferences(const string & path,
+    const PathSet & refs, Hash & hash)
 {
+    RefScanSink sink;
     std::map<string, Path> backMap;
-    StringSet ids;
-    StringSet seen;
 
     /* For efficiency (and a higher hit rate), just search for the
        hash part of the file name.  (This assumes that all references
        have the form `HASH-bla'). */
-    for (PathSet::const_iterator i = paths.begin(); i != paths.end(); i++) {
+    foreach (PathSet::const_iterator, i, refs) {
         string baseName = baseNameOf(*i);
         string::size_type pos = baseName.find('-');
         if (pos == string::npos)
@@ -138,21 +97,25 @@ PathSet scanForReferences(const string & path, const PathSet & paths)
         assert(s.size() == refLength);
         assert(backMap.find(s) == backMap.end());
         // parseHash(htSHA256, s);
-        ids.insert(s);
+        sink.hashes.insert(s);
         backMap[s] = *i;
     }
 
-    checkPath(path, ids, seen);
+    /* Look for the hashes in the NAR dump of the path. */
+    dumpPath(path, sink);
 
+    /* Map the hashes found back to their store paths. */
     PathSet found;
-    for (StringSet::iterator i = seen.begin(); i != seen.end(); i++) {
+    foreach (StringSet::iterator, i, sink.seen) {
         std::map<string, Path>::iterator j;
         if ((j = backMap.find(*i)) == backMap.end()) abort();
         found.insert(j->second);
     }
 
+    hash = sink.hashSink.finish();
+        
     return found;
 }
 
- 
+
 }